Skip to content

Commit a5a80ba

Browse files
authored
CLDR-18996 fix GenerateLanguageContainment (#5058)
1 parent 95149ea commit a5a80ba

File tree

6 files changed

+76
-12
lines changed

6 files changed

+76
-12
lines changed

docs/site/development/updating-codes/updating-language-groups.md

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,36 @@
22
title: Updating Language Groups
33
---
44

5-
# Updating Language Groups
6-
7-
This file has not yet been converted: see https://sites.google.com/unicode.org/cldr/development/updating-codes/updating-language-groups
5+
1. (Prerequisite: being able to build CLDR locally with [Maven](/development/maven).)
6+
2. Run GenerateLanguageContainment, through Eclipse or Maven.
7+
Here is how you can run it with Maven:
8+
1. cd cldr/tools
9+
2. `mvn -DCLDR_DIR=/path/to/cldr -Dexec.mainClass=org.unicode.cldr.tool.GenerateLanguageContainment exec:java -pl cldr-rdf` (replace `/path/to/cldr` with the path to your CLDR checkout)
10+
3. This will create {workspace}/cldr/common/supplemental/languageGroup.xml
11+
1. Copy the console log into debugLog.txt to help in debugging problems. (Should modify tool to do this.)
12+
2. Run TestLanguageGroup and fix problems if necessary.
13+
3. OVERRIDES: If a language code moves or is deleted, consider adding override to GenerateLanguageContainment
14+
1. Additions go in EXTRA\_PARENT\_CHILDREN
15+
1. If you add something, you might have to remove it someplace else. You'll get a "duplicate parent" error in TestLanguageGroup
16+
2. Removals go in REMOVE\_PARENT\_CHILDREN
17+
1. "\*" for value means all.
18+
4. Example: pcm \[Nigerian Pidgin\] \[pcm\] \- not in languages/isolates.json nor languageGroup.xml
19+
1. Go to [https://en.wikipedia.org/wiki/Nigerian\_Pidgin](https://en.wikipedia.org/wiki/Nigerian_Pidgin) (by searching)
20+
2. Under language family, click on the ancestor. Keep clicking until you find a language group with an "[**ISO 639-2**](https://en.wikipedia.org/wiki/ISO_639-2) **/ [5](https://en.wikipedia.org/wiki/ISO_639-5)**" code.
21+
3. Get the ancestor chain (see below); in this case we find kri.
22+
4. Go to GenerateLanguageContainment.EXTRA\_PARENT\_CHILDREN, add .put("kri", "pcm")
23+
5. Example: inc \[Indic\] is not an ancestor of trw \[Torwali\]: expected true
24+
1. Go to [https://en.wikipedia.org/wiki/Torwali\_language](https://en.wikipedia.org/wiki/Torwali_language) (find by searching).
25+
1. Under language family, click on the ancestor. Keep clicking until you find a language group with an "[**ISO 639-2**](https://en.wikipedia.org/wiki/ISO_639-2) **/ [5](https://en.wikipedia.org/wiki/ISO_639-5)**" code.
26+
2. That says 'inc', so we have a case where Wikidata is out of sync with Wikipedia.
27+
3. Go to GenerateLanguageContainment.EXTRA\_PARENT\_CHILDREN, add .put("inc", "trw")
28+
6. Occasionally LanguageGroup.java will need some fixes instead, once you have done the research.
29+
1. Once you are done, rerun GenerateLanguageContainment and TestLanguageGroup
30+
1. You may need to repeat the process to get a full chain of ancestors.
31+
2. Example: For X-based creoles, we use X as the parent, so for the first example above we also needed .put("en", "kri")
32+
4. Run the tool **ChartLanguageGroups**
33+
1. Review {workspace}/../cldr-staging/docs/charts/*\<release\>*/supplemental/language\_groups.html
34+
2. Check in
35+
1. {workspace}/cldr/common/supplemental/languageGroup.xml
36+
2. {workspace}/cldr/tools/cldr-rdf/external/\*.tsv *( intermediate tables, for tracking)*
37+
3. Chart: {workspace}/../cldr-staging/docs/charts/*\<release\>*/supplemental/language\_groups.html

tools/cldr-code/src/main/java/org/unicode/cldr/util/CLDRFile.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2917,6 +2917,9 @@ public String getWinningPath(String path) {
29172917
* @return
29182918
*/
29192919
public Collection<String> getExtraPaths() {
2920+
if (this.getDtdType() != DtdType.ldml) {
2921+
return Collections.emptySet();
2922+
}
29202923
Set<String> toAddTo = new HashSet<>(getRawExtraPaths());
29212924
for (String path : this.iterableWithoutExtras()) {
29222925
toAddTo.remove(path);
@@ -2930,6 +2933,9 @@ public Collection<String> getExtraPaths() {
29302933
* @return
29312934
*/
29322935
public Collection<String> getExtraPaths(String prefix, Collection<String> toAddTo) {
2936+
if (this.getDtdType() != DtdType.ldml) {
2937+
return Collections.emptySet();
2938+
}
29332939
for (String item : getRawExtraPaths()) {
29342940
if (item.startsWith(prefix)
29352941
&& dataSource.getValueAtPath(item) == null) { // don't use getStringValue, since
@@ -2949,6 +2955,9 @@ public Collection<String> getExtraPaths(String prefix, Collection<String> toAddT
29492955
* @return
29502956
*/
29512957
public Set<String> getRawExtraPaths() {
2958+
if (this.getDtdType() != DtdType.ldml) {
2959+
return Collections.emptySet();
2960+
}
29522961
if (extraPaths == null) {
29532962
extraPaths = ImmutableSet.<String>builder().addAll(getRawExtraPathsPrivate()).build();
29542963
if (DEBUG) {

tools/cldr-code/src/main/java/org/unicode/cldr/util/DiffLanguageGroups.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,9 @@ public static void showDiff(String title, Set<String> oldMinusOther) {
280280
}
281281

282282
public static String show(String languageCode) {
283-
return languageCode.equals("mul") ? "Ω" : getName(languageCode) + " ⁅" + languageCode + "⁆";
283+
return languageCode.equals("mul")
284+
? languageCode
285+
: getName(languageCode) + " ⁅" + languageCode + "⁆";
284286
}
285287

286288
public static String getName(String languageCode) {

tools/cldr-code/src/main/java/org/unicode/cldr/util/DtdData.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1023,8 +1023,16 @@ public int xpathComparator(XPathParts a, XPathParts b) {
10231023
String baseA = a.getElement(0);
10241024
String baseB = b.getElement(0);
10251025
if (!ROOT.name.equals(baseA) || !ROOT.name.equals(baseB)) {
1026+
final XPathParts oddity = (ROOT.name.equals(baseA) ? b : a);
1027+
final String baseOddity = oddity.getElement(0);
1028+
final String elementOddity = oddity.getElement(-1);
10261029
throw new IllegalArgumentException(
1027-
"Comparing different DTDs: " + ROOT.name + ", " + baseA + ", " + baseB);
1030+
"Comparing different DTDs: This comparator is for DTD "
1031+
+ ROOT.name
1032+
+ ", but attempted compare with DTD "
1033+
+ baseOddity
1034+
+ " in: "
1035+
+ oddity.toString());
10281036
}
10291037
int min = Math.min(a.size(), b.size());
10301038
Element parent = ROOT;

tools/cldr-code/src/main/java/org/unicode/cldr/util/TempPrintWriter.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@
1919
* the old file (except for date), then it is deleted. Otherwise it replaces the target file. Moved
2020
* from UnicodeTools.
2121
*
22+
* <p>dontReplaceFile() may be called before close() to prevent replacement (such as if a processing
23+
* error occurred)
24+
*
2225
* @author markdavis
2326
*/
2427
public class TempPrintWriter extends Writer {
@@ -55,6 +58,10 @@ public static TempPrintWriter openUTF8Writer(String dir, String filename) {
5558
return new TempPrintWriter(new File(dir, filename));
5659
}
5760

61+
public static TempPrintWriter openUTF8Writer(File dir, String filename) {
62+
return new TempPrintWriter(new File(dir, filename));
63+
}
64+
5865
public TempPrintWriter(String dir, String filename) {
5966
this(new File(dir, filename));
6067
}
@@ -76,6 +83,10 @@ public TempPrintWriter(File file) {
7683
}
7784
}
7885

86+
/**
87+
* Will prevent the file from being overwritten. Call this before close() if something goes
88+
* wrong during write.
89+
*/
7990
public void dontReplaceFile() {
8091
noReplace = true;
8192
}

tools/cldr-rdf/src/main/java/org/unicode/cldr/tool/GenerateLanguageContainment.java

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
import org.unicode.cldr.util.StandardCodes;
5151
import org.unicode.cldr.util.StandardCodes.LstrField;
5252
import org.unicode.cldr.util.StandardCodes.LstrType;
53+
import org.unicode.cldr.util.TempPrintWriter;
5354
import org.unicode.cldr.util.Validity;
5455
import org.unicode.cldr.util.Validity.Status;
5556

@@ -461,8 +462,8 @@ void run(String[] args) throws IOException {
461462
}
462463
}
463464
System.out.println("Writing " + "skippingCodes.tsv");
464-
try (PrintWriter w =
465-
FileUtilities.openUTF8Writer(TsvWriter.getTsvDir(), "skippingCodes.tsv")) {
465+
try (TempPrintWriter w =
466+
TempPrintWriter.openUTF8Writer(TsvWriter.getTsvDir(), "skippingCodes.tsv")) {
466467
// TsvWriter.writeRow(w, "childCode\tLabel", "parentCode\tLabel"); // header
467468
skipping.forEach(e -> w.println(e));
468469
}
@@ -568,12 +569,15 @@ void run(String[] args) throws IOException {
568569
newFile.add("//" + DtdType.supplementalData + "/version[@number='$Revision$']", "");
569570
printXML(newFile, parentToChild);
570571

571-
try (PrintWriter outFile =
572-
FileUtilities.openUTF8Writer(
572+
try (TempPrintWriter outFile =
573+
TempPrintWriter.openUTF8Writer(
573574
CLDRPaths.SUPPLEMENTAL_DIRECTORY, "languageGroup.xml")) {
574-
newFile.write(outFile);
575-
} catch (IOException e1) {
576-
throw new ICUUncheckedIOException("Can't write to languageGroup.xml", e1);
575+
try {
576+
newFile.write(outFile.asPrintWriter());
577+
} catch (Throwable e1) {
578+
outFile.dontReplaceFile();
579+
throw new ICUUncheckedIOException("Can't write to languageGroup.xml", e1);
580+
}
577581
}
578582

579583
// for (Entry<String,String> entry : childToParent.entries()) {

0 commit comments

Comments
 (0)