Skip to content

Commit 118c8cd

Browse files
CLDR-18233 Remove language families from likely subtags (#4598)
1 parent 1ad583f commit 118c8cd

File tree

5 files changed

+38
-134
lines changed

5 files changed

+38
-134
lines changed

common/supplemental/likelySubtags.xml

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,7 @@ not be patched by hand, as any changes made in that fashion may be lost.
347347
<likelySubtag from="kdh" to="kdh_Latn_TG"/> <!--Tem‧?‧? ➡ Tem‧Latin‧Togo-->
348348
<likelySubtag from="kdt" to="kdt_Thai_TH"/> <!--Kuy‧?‧? ➡ Kuy‧Thai‧Thailand-->
349349
<likelySubtag from="kea" to="kea_Latn_CV"/> <!--Kabuverdianu‧?‧? ➡ Kabuverdianu‧Latin‧Cape Verde-->
350-
<likelySubtag from="kek" to="kek_Latn_GT"/> <!--Kekchí‧?‧? ➡ Kekchí‧Latin‧Guatemala-->
350+
<likelySubtag from="kek" to="kek_Latn_GT"/> <!--Qʼeqchiʼ‧?‧? ➡ Qʼeqchiʼ‧Latin‧Guatemala-->
351351
<likelySubtag from="ken" to="ken_Latn_CM"/> <!--Kenyang‧?‧? ➡ Kenyang‧Latin‧Cameroon-->
352352
<likelySubtag from="kfo" to="kfo_Latn_CI"/> <!--Koro‧?‧? ➡ Koro‧Latin‧Côte d’Ivoire-->
353353
<likelySubtag from="kfr" to="kfr_Deva_IN"/> <!--Kachhi‧?‧? ➡ Kachhi‧Devanagari‧India-->
@@ -890,7 +890,6 @@ not be patched by hand, as any changes made in that fashion may be lost.
890890
<likelySubtag from="zh_Bopo" to="zh_Bopo_TW"/> <!--Chinese‧Bopomofo‧? ➡ Chinese‧Bopomofo‧Taiwan-->
891891
<likelySubtag from="zh_Hanb" to="zh_Hanb_TW"/> <!--Chinese‧Han with Bopomofo‧? ➡ Chinese‧Han with Bopomofo‧Taiwan-->
892892
<likelySubtag from="zh_Hant" to="zh_Hant_TW"/> <!--Chinese‧Traditional‧? ➡ Chinese‧Traditional‧Taiwan-->
893-
<likelySubtag from="zhx" to="zhx_Nshu_CN"/> <!--Chinese (family)‧?‧? ➡ Chinese (family)‧Nüshu‧China-->
894893
<likelySubtag from="zkt" to="zkt_Kits_CN"/> <!--Kitan‧?‧? ➡ Kitan‧Khitan small script‧China-->
895894
<likelySubtag from="zlm" to="zlm_Latn_MY"/> <!--Malay (individual language)‧?‧? ➡ Malay (individual language)‧Latin‧Malaysia-->
896895
<likelySubtag from="zmi" to="zmi_Latn_MY"/> <!--Negeri Sembilan Malay‧?‧? ➡ Negeri Sembilan Malay‧Latin‧Malaysia-->
@@ -1296,7 +1295,6 @@ not be patched by hand, as any changes made in that fashion may be lost.
12961295
<likelySubtag from="und_Newa" to="new_Newa_NP"/> <!--?‧Newa‧? ➡ Newari‧Newa‧Nepal-->
12971296
<likelySubtag from="und_Nkoo" to="man_Nkoo_GN"/> <!--?‧N’Ko‧? ➡ Mandingo‧N’Ko‧Guinea-->
12981297
<likelySubtag from="und_Nkoo_ML" to="bm_Nkoo_ML"/> <!--?‧N’Ko‧Mali ➡ Bambara‧N’Ko‧Mali-->
1299-
<likelySubtag from="und_Nshu" to="zhx_Nshu_CN"/> <!--?‧Nüshu‧? ➡ Chinese (family)‧Nüshu‧China-->
13001298
<likelySubtag from="und_Ogam" to="sga_Ogam_IE"/> <!--?‧Ogham‧? ➡ Old Irish‧Ogham‧Ireland-->
13011299
<likelySubtag from="und_Olck" to="sat_Olck_IN"/> <!--?‧Ol Chiki‧? ➡ Santali‧Ol Chiki‧India-->
13021300
<likelySubtag from="und_Onao" to="unr_Onao_IN"/> <!--?‧Ol Onal‧? ➡ Mundari‧Ol Onal‧India-->
@@ -5801,7 +5799,6 @@ not be patched by hand, as any changes made in that fashion may be lost.
58015799
<likelySubtag from="pps" to="pps_Latn_MX" origin="sil1"/> <!--San Luís Temalacayuca Popoloca‧?‧? ➡ San Luís Temalacayuca Popoloca‧Latin‧Mexico-->
58025800
<likelySubtag from="ppt" to="ppt_Latn_PG" origin="sil1"/> <!--Pare‧?‧? ➡ Pare‧Latin‧Papua New Guinea-->
58035801
<likelySubtag from="pqa" to="pqa_Latn_NG" origin="sil1"/> <!--Pa'a‧?‧? ➡ Pa'a‧Latin‧Nigeria-->
5804-
<likelySubtag from="pra" to="pra_Khar_PK" origin="sil1"/> <!--Prakrit languages‧?‧? ➡ Prakrit languages‧Kharoshthi‧Pakistan-->
58055802
<likelySubtag from="prc" to="prc_Arab_AF" origin="sil1"/> <!--Parachi‧?‧? ➡ Parachi‧Arabic‧Afghanistan-->
58065803
<likelySubtag from="pre" to="pre_Latn_ST" origin="sil1"/> <!--Principense‧?‧? ➡ Principense‧Latin‧São Tomé & Príncipe-->
58075804
<likelySubtag from="prf" to="prf_Latn_PH" origin="sil1"/> <!--Paranan‧?‧? ➡ Paranan‧Latin‧Philippines-->

tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtags.java

Lines changed: 12 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
import com.google.common.collect.ComparisonChain;
66
import com.google.common.collect.ImmutableList;
77
import com.google.common.collect.ImmutableMap;
8-
import com.google.common.collect.ImmutableSet;
98
import com.google.common.collect.Sets;
109
import com.ibm.icu.impl.Relation;
1110
import com.ibm.icu.impl.Row;
@@ -42,6 +41,7 @@
4241
import org.unicode.cldr.util.Containment;
4342
import org.unicode.cldr.util.Counter;
4443
import org.unicode.cldr.util.Factory;
44+
import org.unicode.cldr.util.Iso639Data;
4545
import org.unicode.cldr.util.LanguageTagParser;
4646
import org.unicode.cldr.util.LocaleNames;
4747
import org.unicode.cldr.util.LocaleScriptInfo;
@@ -83,13 +83,8 @@ public enum LocaleOverride {
8383
private static final Map<String, Status> SCRIPT_CODE_TO_STATUS =
8484
Validity.getInstance().getCodeToStatus(LstrType.script);
8585

86-
private static final String TEMP_UNKNOWN_REGION = "XZ";
87-
88-
private static final String DEBUG_ADD_KEY = "und_Latn_ZA";
89-
9086
private static final double MIN_UNOFFICIAL_LANGUAGE_SIZE = 10000000;
9187
private static final double MIN_UNOFFICIAL_LANGUAGE_PROPORTION = 0.20;
92-
private static final double MIN_UNOFFICIAL_CLDR_LANGUAGE_SIZE = 100000;
9388

9489
/** When a language is not official, scale it down. */
9590
private static final double UNOFFICIAL_SCALE_DOWN = 0.2;
@@ -153,8 +148,6 @@ private static Set<String> parse(String[] args) {
153148
private static boolean DEBUG;
154149
private static Map<String, LstrType> WATCH_PAIRS = null;
155150

156-
private static final boolean SHOW_OVERRIDES = true;
157-
158151
static final Map<String, LSRSource> silData = LangTagsData.getJsonData();
159152

160153
public static void main(String[] args) throws IOException {
@@ -217,7 +210,6 @@ public static void main(String[] args) throws IOException {
217210
throw new IllegalArgumentException();
218211
}
219212

220-
Set<String> newAdditions = new TreeSet<>();
221213
Set<String> newMissing = new TreeSet<>();
222214

223215
// Check against last version
@@ -291,12 +283,6 @@ public static void main(String[] args) throws IOException {
291283
System.out.println("Keeping macroregions used in cldr " + cldrContainerToLanguages);
292284
}
293285

294-
private static final List<String> KEEP_TARGETS =
295-
DROP_HARDCODED ? List.of() : List.of("und_Arab_PK", "und_Latn_ET");
296-
297-
private static final ImmutableSet<String> deprecatedISONotInLST =
298-
DROP_HARDCODED ? ImmutableSet.of() : ImmutableSet.of("scc", "scr");
299-
300286
/**
301287
* This is the simplest way to override, by supplying the max value. It gets a very low weight,
302288
* so doesn't override any stronger value.
@@ -469,7 +455,6 @@ public static void main(String[] args) throws IOException {
469455
{"rhg_Arab", "rhg_Arab_MM"},
470456
{"und_Arab_MM", "rhg_Arab_MM"},
471457
{"sd_IN", "sd_Deva_IN"}, // preferred in CLDR
472-
// { "sd_Deva", "sd_Deva_IN"},
473458
{"und_Cpmn", "und_Cpmn_CY"},
474459
{"oc_ES", "oc_Latn_ES"},
475460
{"os", "os_Cyrl_GE"},
@@ -517,15 +502,6 @@ public static void main(String[] args) throws IOException {
517502
{"ko", "Kore"}, // Korean (North Korea)
518503
{"ko_KR", "Kore"}, // Korean (South Korea)
519504
{"ja", "Jpan"}, // Special script for japan
520-
521-
// {"chk", "Latn"}, // Chuukese (Micronesia)
522-
// {"fil", "Latn"}, // Filipino (Philippines)"
523-
// {"pap", "Latn"}, // Papiamento (Netherlands Antilles)
524-
// {"pau", "Latn"}, // Palauan (Palau)
525-
// {"su", "Latn"}, // Sundanese (Indonesia)
526-
// {"tet", "Latn"}, // Tetum (East Timor)
527-
// {"tk", "Latn"}, // Turkmen (Turkmenistan)
528-
// {"ty", "Latn"}, // Tahitian (French Polynesia)
529505
// {LocaleNames.UND, "Latn"}, // Ultimate fallback
530506
};
531507

@@ -544,21 +520,6 @@ public static void main(String[] args) throws IOException {
544520
}
545521
}
546522

547-
private static Map<String, String> FALLBACK_SCRIPTS;
548-
549-
static {
550-
LanguageTagParser additionLtp = new LanguageTagParser();
551-
Map<String, String> _FALLBACK_SCRIPTS = new TreeMap<>();
552-
for (String addition : MAX_ADDITIONS) {
553-
additionLtp.set(addition);
554-
String lan = additionLtp.getLanguage();
555-
_FALLBACK_SCRIPTS.put(lan, additionLtp.getScript());
556-
}
557-
FALLBACK_SCRIPTS = ImmutableMap.copyOf(_FALLBACK_SCRIPTS);
558-
}
559-
560-
private static int errorCount;
561-
562523
/**
563524
* Debugging function that returns false if the flag is false, otherwise returns true if the
564525
* WATCH is null or the locales don't match the WATCH.
@@ -679,8 +640,6 @@ public static String getNameSafe(String oldValue) {
679640
private static OutputStyle OUTPUT_STYLE =
680641
OutputStyle.valueOf(CldrUtility.getProperty("OutputStyle", "XML", "XML").toUpperCase());
681642

682-
private static final String TAG_SEPARATOR = OUTPUT_STYLE == OutputStyle.C_ALT ? "-" : "_";
683-
684643
private static final Joiner JOIN_SPACE = Joiner.on(' ');
685644
private static final Joiner JOIN_UBAR = Joiner.on('_');
686645

@@ -728,21 +687,6 @@ private static Map<String, String> generatePopulationData(Map<String, String> to
728687

729688
if (data.getOfficialStatus() == OfficialStatus.unknown) {
730689
final String locale = writtenLanguage + "_" + region;
731-
// if (literatePopulation >= minimalLiteratePopulation) {
732-
// // ok, skip
733-
// } else if (literatePopulation >=
734-
// MIN_UNOFFICIAL_CLDR_LANGUAGE_SIZE
735-
// && cldrLocales.contains(locale)) {
736-
// // ok, skip
737-
// } else {
738-
// // if (SHOW_ADD)
739-
// // System.out.println("Skipping:\t" + writtenLanguage
740-
// + "\t" + region + "\t"
741-
// // + english.nameGetter().getName(locale)
742-
// // + "\t-- too small:\t" +
743-
// number.format(literatePopulation));
744-
// // continue;
745-
// }
746690
order *= UNOFFICIAL_SCALE_DOWN;
747691
if (watching(SHOW_POP, writtenLanguage))
748692
System.out.println(
@@ -800,22 +744,6 @@ private static Map<String, String> generatePopulationData(Map<String, String> to
800744
}
801745
}
802746

803-
// Old code for getting language to script, adding XZ, which converts to ZZ. Replaced by use
804-
// of SIL data
805-
806-
// for (Entry<String, Collection<String>> entry :
807-
// DeriveScripts.getLanguageToScript().asMap().entrySet()) {
808-
// String language = entry.getKey();
809-
// final Collection<String> values = entry.getValue();
810-
// if (values.size() != 1) {
811-
// continue; // skip, no either way
812-
// }
813-
// Set<R3<Double, String, String>> old = maxData.languages.get(language);
814-
// if (!maxData.languages.containsKey(language)) {
815-
// maxData.add(language, values.iterator().next(), TEMP_UNKNOWN_REGION, 1.0);
816-
// }
817-
// }
818-
819747
// add others, with English default
820748
for (String region : otherTerritories) {
821749
if (!LocaleValidator.ALLOW_IN_LIKELY.isAllowed(LstrType.region, region, null, null)) {
@@ -846,10 +774,6 @@ private static Map<String, String> generatePopulationData(Map<String, String> to
846774
continue;
847775
}
848776

849-
if (deprecatedISONotInLST.contains(badLanguage)) {
850-
continue;
851-
}
852-
853777
if (LANGUAGE_CODE_TO_STATUS.get(badLanguage) != Validity.Status.regular) {
854778
if (!LocaleValidator.ALLOW_IN_LIKELY.isAllowed(
855779
LstrType.language, badLanguage, null, null)) {
@@ -1071,6 +995,10 @@ private static Map<String, String> generatePopulationData(Map<String, String> to
1071995
System.out.println(JOIN_LS.join("Failure in ScriptMetaData: " + ltp, errors));
1072996
continue;
1073997
}
998+
if (isLanguageCollection(likelyLanguage)) {
999+
// Dropping language collections
1000+
continue;
1001+
}
10741002
final String result = likelyLanguage + "_" + script + "_" + originCountry;
10751003
add("und_" + script, result, toMaximized, "S->LR•", LocaleOverride.KEEP_EXISTING);
10761004
add(likelyLanguage, result, toMaximized, "L->SR•", LocaleOverride.KEEP_EXISTING);
@@ -1758,7 +1686,7 @@ private static File printLikelySubtags(Map<String, String> fluffup) throws IOExc
17581686
for (Entry<String, LSRSource> entry : silData.entrySet()) {
17591687
CLDRLocale source = CLDRLocale.getInstance(entry.getKey());
17601688
String lang = source.getLanguage();
1761-
if (!fluffup.containsKey(lang)) {
1689+
if (!fluffup.containsKey(lang) && !isLanguageCollection(lang)) {
17621690
silMap.put(entry.getKey(), entry.getValue().getLsrString());
17631691
if (!entry.getValue().getSources().isEmpty()) {
17641692
silOrigins.put(entry.getKey(), entry.getValue().getSourceString());
@@ -1841,4 +1769,10 @@ public static void printLine(
18411769
}
18421770
}
18431771
}
1772+
1773+
// Check if the language code is a collection of languages (ISO 639-5). Otherwise it's probably
1774+
// an individual one or maybe a macrolanguage.
1775+
private static Boolean isLanguageCollection(String language) {
1776+
return Iso639Data.getHierarchy(language) != null;
1777+
}
18441778
}

tools/cldr-code/src/main/java/org/unicode/cldr/util/Iso639Data.java

Lines changed: 22 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ public class Iso639Data {
3434

3535
static Map<String, Scope> toScope;
3636

37-
static Map<String, List<String>> toHeirarchy;
37+
static Map<String, List<String>> toHierarchy;
3838

3939
static Map<String, Type> toType;
4040

@@ -246,12 +246,12 @@ public static Scope getScope(String languageSubtag) {
246246
return Scope.Individual;
247247
}
248248

249-
/** Returns the ISO 639-5 heirarchy if available, otherwise null. */
250-
public static List<String> getHeirarchy(String languageSubtag) {
251-
if (toHeirarchy == null) {
249+
/** Returns the ISO 639-5 hierarchy if available, otherwise null. */
250+
public static List<String> getHierarchy(String languageSubtag) {
251+
if (toHierarchy == null) {
252252
getData();
253253
}
254-
return toHeirarchy.get(languageSubtag);
254+
return toHierarchy.get(languageSubtag);
255255
}
256256

257257
public static Type getType(String languageSubtag) {
@@ -423,6 +423,7 @@ private static void getData() {
423423
// System.out.println("Size:\t" + toNames.size());
424424
in.close();
425425

426+
// TODO https://unicode-org.atlassian.net/browse/CLDR-18499 Find an up-to-date source
426427
in = CldrUtility.getUTF8Data("ISO-639-2_values_8bits.txt");
427428
// An alpha-3 (bibliographic) code,
428429
// an alpha-3 (terminologic) code (when given),
@@ -465,6 +466,8 @@ private static void getData() {
465466
toAlpha3.put(languageSubtag, alpha3);
466467
fromAlpha3.put(alpha3, languageSubtag);
467468
}
469+
// Warning: This is not always correct. Deprecated ISO 639-3 codes will also
470+
// appear here
468471
toScope.put(languageSubtag, Scope.Collection);
469472
toType.put(languageSubtag, Type.Special);
470473
toNames.putAll(languageSubtag, Arrays.asList(english));
@@ -473,7 +476,7 @@ private static void getData() {
473476
}
474477
in.close();
475478

476-
Map<String, String> toHeirarchyTemp = new TreeMap<>();
479+
Map<String, String> toHierarchyTemp = new TreeMap<>();
477480
in = CldrUtility.getUTF8Data("external/Iso639-5.html");
478481
String lastCode = null;
479482
int column = 0;
@@ -535,8 +538,8 @@ private static void getData() {
535538
lastCode = result.toString();
536539
break;
537540
case 5:
538-
String old = toHeirarchyTemp.get(lastCode);
539-
toHeirarchyTemp.put(
541+
String old = toHierarchyTemp.get(lastCode);
542+
toHierarchyTemp.put(
540543
lastCode,
541544
old == null || old.length() == 0
542545
? result.toString().trim()
@@ -571,26 +574,26 @@ private static void getData() {
571574

572575
in.close();
573576

574-
Pattern SPLIT_HEIRARCHY = PatternCache.get("\\s*:\\s*");
575-
toHeirarchy = new TreeMap<>();
576-
// for (String code : toHeirarchyTemp.keySet()) {
577-
// System.out.println(code + " => " + toHeirarchyTemp.get(code));
577+
Pattern SPLIT_HIERARCHY = PatternCache.get("\\s*:\\s*");
578+
toHierarchy = new TreeMap<>();
579+
// for (String code : toHierarchyTemp.keySet()) {
580+
// System.out.println(code + " => " + toHierarchyTemp.get(code));
578581
// }
579-
for (String code : toHeirarchyTemp.keySet()) {
580-
String valueString = toHeirarchyTemp.get(code);
581-
String[] values = SPLIT_HEIRARCHY.split(valueString);
582+
for (String code : toHierarchyTemp.keySet()) {
583+
String valueString = toHierarchyTemp.get(code);
584+
String[] values = SPLIT_HIERARCHY.split(valueString);
582585
for (String value : values) {
583-
if (toScope.get(value) == null && toHeirarchyTemp.get(value) == null) {
586+
if (toScope.get(value) == null && toHierarchyTemp.get(value) == null) {
584587
throw new IllegalArgumentException(
585-
"Unexpected value in heirarchy:\t"
588+
"Unexpected value in hierarchy:\t"
586589
+ value
587590
+ "\t"
588591
+ code
589592
+ "\t"
590593
+ valueString);
591594
}
592595
}
593-
toHeirarchy.put(code, Arrays.asList(values));
596+
toHierarchy.put(code, Arrays.asList(values));
594597
}
595598
// System.out.println("Size:\t" + toNames.size());
596599

@@ -602,7 +605,7 @@ private static void getData() {
602605
fromBiblio3 = Collections.unmodifiableMap(fromBiblio3);
603606
toScope = Collections.unmodifiableMap(toScope);
604607
toType = Collections.unmodifiableMap(toType);
605-
toHeirarchy = Collections.unmodifiableMap(toHeirarchy);
608+
toHierarchy = Collections.unmodifiableMap(toHierarchy);
606609

607610
toNames.freeze();
608611
toRetirements.freeze();

0 commit comments

Comments
 (0)