Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package opennlp.tools.util;

import java.util.HashSet;
import java.util.Locale;
import java.util.MissingResourceException;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * Validates language codes against ISO 639 standards.
 * <p>
 * Accepts:
 * <ul>
 * <li>ISO 639-1 two-letter language codes
 * (e.g., {@code "en"}, {@code "de"})</li>
 * <li>ISO 639-2/3 three-letter language codes
 * (e.g., {@code "eng"}, {@code "deu"})</li>
 * <li>The special code {@code "x-unspecified"}
 * used internally by OpenNLP</li>
 * </ul>
 * <p>
 * Valid codes are derived from {@link Locale#availableLocales()} and
 * {@link Locale#getISOLanguages()}, plus a few ISO 639-2 bibliographic
 * codes and {@code "und"} (undetermined). Matching is case-sensitive:
 * codes are compared exactly as the JDK reports them.
 * <p>
 * The set of valid codes is computed once when the class is initialized
 * and is immutable afterwards.
 *
 * @see <a href="https://iso639-3.sil.org/">ISO 639-3</a>
 */
public final class LanguageCodeValidator {

  private static final String X_UNSPECIFIED = "x-unspecified";

  private static final Set<String> VALID_CODES;

  static {
    // Language codes of every locale installed in the running JVM
    final Set<String> codes = Locale.availableLocales()
        .flatMap(loc -> languageCodesOf(loc).stream())
        .collect(Collectors.toCollection(HashSet::new));

    // Every two-letter code the JDK knows, even when no locale data is
    // installed for it, so that each entry of Locale.getISOLanguages()
    // is accepted regardless of which locales are available.
    for (String iso2 : Locale.getISOLanguages()) {
      codes.add(iso2);
      codes.addAll(languageCodesOf(Locale.forLanguageTag(iso2)));
    }

    // ISO 639-2 bibliographic codes not returned by Locale
    codes.add("dut"); // Dutch (bibliographic)
    codes.add("fre"); // French (bibliographic)
    codes.add("ger"); // German (bibliographic)

    // ISO 639-3 special code for undetermined language
    codes.add("und");

    // OpenNLP-specific special code
    codes.add(X_UNSPECIFIED);

    // Freeze the set: nothing may alter it after class initialization
    VALID_CODES = Set.copyOf(codes);
  }

  private LanguageCodeValidator() {
    // utility class, not intended to be instantiated
  }

  /**
   * Collects the language codes a single locale contributes: its
   * language as reported by {@link Locale#getLanguage()} and, when
   * available, its three-letter {@link Locale#getISO3Language()} code.
   *
   * @param locale The locale to inspect. Must not be {@code null}.
   * @return A set holding the non-empty codes found; possibly empty.
   */
  private static Set<String> languageCodesOf(Locale locale) {
    final Set<String> found = new HashSet<>();
    final String lang = locale.getLanguage();
    if (!lang.isEmpty()) {
      found.add(lang);
    }
    try {
      final String iso3 = locale.getISO3Language();
      if (!iso3.isEmpty()) {
        found.add(iso3);
      }
    } catch (MissingResourceException ignored) {
      // Some locales may not have a 3-letter equivalent
    }
    return found;
  }

  /**
   * Checks whether the given language code is a valid ISO 639 code.
   *
   * @param languageCode The language code to check.
   *                     Must not be {@code null}.
   * @return {@code true} if the code is valid,
   *         {@code false} otherwise.
   * @throws IllegalArgumentException if {@code languageCode}
   *                                  is {@code null}.
   */
  public static boolean isValid(String languageCode) {
    if (languageCode == null) {
      throw new IllegalArgumentException(
          "languageCode must not be null");
    }
    return VALID_CODES.contains(languageCode);
  }

  /**
   * Validates the given language code and throws an
   * {@link IllegalArgumentException} if it is not a recognized
   * ISO 639 language code.
   *
   * @param languageCode The language code to validate.
   *                     Must not be {@code null}.
   * @throws IllegalArgumentException if the code is not a valid
   *                                  ISO 639 language code or is {@code null}.
   */
  public static void validateLanguageCode(String languageCode) {
    if (!isValid(languageCode)) {
      throw new IllegalArgumentException(
          "Unknown language code '" + languageCode
              + "', must be a valid ISO 639 code!");
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,14 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import opennlp.tools.commons.Internal;
import opennlp.tools.ml.TrainerFactory;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.LanguageCodeValidator;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.BaseModel;
Expand Down Expand Up @@ -278,12 +275,10 @@ public static Double getDoubleParameter(String param, String[] args) {
}

public static void checkLanguageCode(String code) {
List<String> languageCodes = new ArrayList<>(Arrays.asList(Locale.getISOLanguages()));
languageCodes.add("x-unspecified");

if (!languageCodes.contains(code)) {
throw new TerminateToolException(1, "Unknown language code " + code + ", " +
"must be an ISO 639 code!");
if (!LanguageCodeValidator.isValid(code)) {
throw new TerminateToolException(1,
"Unknown language code " + code
+ ", must be an ISO 639 code!");
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@

import opennlp.tools.util.BaseToolFactory;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.LanguageCodeValidator;
import opennlp.tools.util.Version;
import opennlp.tools.util.ext.ExtensionLoader;

Expand Down Expand Up @@ -108,6 +109,7 @@ protected BaseModel(String componentName, String languageCode,
this(componentName, false);

Objects.requireNonNull(languageCode, "languageCode must not be null");
LanguageCodeValidator.validateLanguageCode(languageCode);

createBaseArtifactSerializers(artifactSerializers);

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package opennlp.tools.util;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

/**
 * Unit tests for {@link LanguageCodeValidator}: accepted two- and
 * three-letter codes, special codes, rejected inputs and {@code null}
 * handling.
 */
class LanguageCodeValidatorTest {

  /** Asserts, in order, that each given code is reported as valid. */
  private static void assertAccepted(String... codes) {
    for (String candidate : codes) {
      Assertions.assertTrue(LanguageCodeValidator.isValid(candidate));
    }
  }

  /** Asserts, in order, that each given code is reported as invalid. */
  private static void assertRejected(String... codes) {
    for (String candidate : codes) {
      Assertions.assertFalse(LanguageCodeValidator.isValid(candidate));
    }
  }

  @Test
  void testValidIso639_1Codes() {
    assertAccepted("en", "de", "fr", "es", "pt", "it", "nl", "th", "ja", "pl");
  }

  @Test
  void testValidIso639_3TerminologicalCodes() {
    assertAccepted("eng", "deu", "fra", "spa", "por", "ita", "nld", "tha", "jpn", "pol");
  }

  @Test
  void testValidIso639_2BibliographicCodes() {
    assertAccepted("dut", "fre", "ger");
  }

  @Test
  void testUndeterminedCode() {
    assertAccepted("und");
  }

  @Test
  void testSpecialCodes() {
    assertAccepted("x-unspecified");
  }

  @Test
  void testInvalidCodes() {
    // Empty, over-long, upper-case, numeric, too short and region-tagged inputs
    assertRejected("", "xyz123", "invalid", "EN", "ENG", "123", "e", "en-US", "abcd");
  }

  @Test
  void testInvalidTwoLetterCode() {
    assertRejected("xx", "zz");
  }

  @Test
  void testInvalidThreeLetterCode() {
    assertRejected("xyz", "abc", "zzz");
  }

  @Test
  void testNullCode() {
    Assertions.assertThrows(IllegalArgumentException.class,
        () -> LanguageCodeValidator.isValid(null));
  }

  @Test
  void testValidateThrowsForInvalidCode() {
    final IllegalArgumentException thrown = Assertions.assertThrows(
        IllegalArgumentException.class,
        () -> LanguageCodeValidator.validateLanguageCode("invalid_code"));
    // The offending code must be echoed back in the error message
    Assertions.assertTrue(thrown.getMessage().contains("invalid_code"));
  }

  @Test
  void testValidatePassesForValidCode() {
    final String[] accepted = {"en", "eng", "dut", "und", "x-unspecified"};
    for (String candidate : accepted) {
      Assertions.assertDoesNotThrow(
          () -> LanguageCodeValidator.validateLanguageCode(candidate));
    }
  }
}