Skip to content

Commit 50fed54

Browse files
srl295 and macchiati authored
CLDR-19192 use a case insensitive collator for TestAnnotations.TestUniqueness (#5276)
Co-authored-by: Mark Davis <mark@unicode.org>
1 parent cf128d4 commit 50fed54

File tree

2 files changed

+65
-29
lines changed

2 files changed

+65
-29
lines changed

tools/cldr-code/src/main/java/org/unicode/cldr/util/CollatorHelper.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
package org.unicode.cldr.util;
22

3+
import com.ibm.icu.lang.UCharacter;
34
import com.ibm.icu.text.Collator;
45
import com.ibm.icu.text.RuleBasedCollator;
56
import com.ibm.icu.util.ULocale;
7+
import java.util.Comparator;
68

79
public final class CollatorHelper {
810
public static final RuleBasedCollator EMOJI_COLLATOR = makeEmojiCollator();
@@ -13,6 +15,7 @@ public final class CollatorHelper {
1315
public static final RuleBasedCollator ROOT_PRIMARY = makeRootPrimary();
1416
public static final RuleBasedCollator ROOT_PRIMARY_SHIFTED = makeRootPrimaryShifted();
1517
public static final RuleBasedCollator ROOT_SECONDARY = makeRootSecondary();
18+
public static final Comparator<String> CASE_FOLDED = makeCaseFolded();
1619

1720
private static RuleBasedCollator makeEmojiCollator() {
1821
ULocale uLocale = ULocale.forLanguageTag("en-u-co-emoji");
@@ -64,4 +67,21 @@ private static RuleBasedCollator makeRootSecondary() {
6467
col.setStrength(Collator.SECONDARY);
6568
return (RuleBasedCollator) col.freeze();
6669
}
70+
71+
private static Comparator<String> makeCaseFolded() {
72+
// make our own copy to avoid static ordering
73+
final RuleBasedCollator SECONDARY = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT);
74+
{
75+
SECONDARY.setStrength(RuleBasedCollator.SECONDARY);
76+
SECONDARY.freeze();
77+
}
78+
return new Comparator<String>() {
79+
@Override
80+
public int compare(String o1, String o2) {
81+
String n1 = UCharacter.foldCase(o1, 0);
82+
String n2 = UCharacter.foldCase(o2, 0);
83+
return n1.compareTo(n2);
84+
}
85+
};
86+
}
6787
}

tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestAnnotations.java

Lines changed: 45 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import com.google.common.collect.ImmutableSet;
77
import com.google.common.collect.ImmutableSortedSet;
88
import com.google.common.collect.Multimap;
9+
import com.google.common.collect.Ordering;
910
import com.google.common.collect.TreeMultimap;
1011
import com.ibm.icu.impl.Row;
1112
import com.ibm.icu.impl.Row.R3;
@@ -29,13 +30,16 @@
2930
import java.util.TreeMap;
3031
import java.util.TreeSet;
3132
import java.util.regex.Pattern;
33+
import java.util.stream.Collectors;
34+
import java.util.stream.Stream;
3235
import org.unicode.cldr.test.CoverageLevel2;
3336
import org.unicode.cldr.util.Annotations;
3437
import org.unicode.cldr.util.Annotations.AnnotationSet;
3538
import org.unicode.cldr.util.CLDRConfig;
3639
import org.unicode.cldr.util.CLDRFile;
3740
import org.unicode.cldr.util.CLDRPaths;
3841
import org.unicode.cldr.util.CldrUtility;
42+
import org.unicode.cldr.util.CollatorHelper;
3943
import org.unicode.cldr.util.Emoji;
4044
import org.unicode.cldr.util.Factory;
4145
import org.unicode.cldr.util.Level;
@@ -281,34 +285,41 @@ public void TestCategories() {
281285
}
282286
}
283287

288+
// TODO CLDR-16947 - this test should migrate into
289+
// CheckDisplayCollisions-run-against-derived-annotations (see issue)
290+
// TODO CLDR-19189
284291
public void TestUniqueness() {
285-
// if (logKnownIssue(
286-
// "CLDR-16947", "skip duplicate TestUniqueness in favor of
287-
// CheckDisplayCollisions")) {
288-
// return;
289-
// }
290292
Set<String> locales = new TreeSet<>();
291293
locales.add("en");
292294
if (!TEST_ONLY_ENGLISH_UNIQUENESS) {
293295
locales.addAll(Annotations.getAvailable());
294296
locales.remove("root");
295297
}
296298
/*
297-
* Note: "problems" here is a work-around for what appears to be a deficiency
298-
* in the function sourceLocation, involving the call stack. Seemingly sourceLocation
299-
* can't handle the "->" notation used for parallelStream().forEach() if
300-
* uniquePerLocale calls errln directly.
299+
* "problems" is here to collect and sort issues in parallel,
300+
* and avoid issues calling errln() from a lambda.
301301
*/
302-
Set<String> problems = new HashSet<>();
303-
locales.parallelStream().forEach(locale -> uniquePerLocale(locale, problems));
302+
Set<String> problems =
303+
locales.parallelStream()
304+
.flatMap(locale -> uniquePerLocale(locale))
305+
.collect(Collectors.toCollection(() -> new TreeSet<>()));
304306
if (!problems.isEmpty()) {
307+
if (logKnownIssue(
308+
"CLDR-19189",
309+
"cased collision in annotations:\n" + String.join("\n", problems))) {
310+
return;
311+
}
305312
problems.forEach(s -> errln(s));
306313
}
307314
}
308315

309-
private void uniquePerLocale(String locale, Set<String> problems) {
316+
private Stream<String> uniquePerLocale(String locale) {
317+
Set<String> problems = new TreeSet<>();
310318
logln("uniqueness: " + locale);
311-
Multimap<String, String> nameToEmoji = TreeMultimap.create();
319+
// use a case insensitive collator
320+
// 'value' is originalName -> emoji
321+
Multimap<String, Pair<String, String>> nameToEmoji =
322+
TreeMultimap.create(CollatorHelper.CASE_FOLDED, Ordering.natural());
312323
AnnotationSet data = Annotations.getDataSet(locale);
313324
for (String emoji : Emoji.getAllRgi()) {
314325
String name = data.getShortName(emoji);
@@ -319,37 +330,42 @@ private void uniquePerLocale(String locale, Set<String> problems) {
319330
throw new IllegalArgumentException(
320331
CldrUtility.INHERITANCE_MARKER + " in name of " + emoji + " in " + locale);
321332
}
322-
nameToEmoji.put(name, emoji);
333+
nameToEmoji.put(name, Pair.of(name, emoji));
323334
}
324335
Multimap<String, String> duplicateNameToEmoji = null;
325-
for (Entry<String, Collection<String>> entry : nameToEmoji.asMap().entrySet()) {
336+
for (Entry<String, Collection<Pair<String, String>>> entry :
337+
nameToEmoji.asMap().entrySet()) {
326338
String name = entry.getKey();
327-
Collection<String> emojis = entry.getValue();
339+
final Collection<Pair<String, String>> emojis = entry.getValue();
340+
if (duplicateNameToEmoji == null) {
341+
duplicateNameToEmoji = TreeMultimap.create();
342+
}
328343
if (emojis.size() > 1) {
329-
synchronized (problems) {
330-
if (problems.add(
331-
"Duplicate name in "
332-
+ locale
333-
+ ": “"
334-
+ name
335-
+ "” for "
336-
+ Joiner.on(" & ").join(emojis))) {
337-
int debug = 0;
344+
final String prefix = "Duplicate name in " + locale + ": “" + name + "” for ";
345+
final StringBuilder remainder = new StringBuilder();
346+
for (final Pair<String, String> emoji : emojis) {
347+
duplicateNameToEmoji.put(emoji.getFirst(), emoji.getSecond());
348+
if (remainder.length() > 0) { // ampersand after the first item
349+
remainder.append(" & ");
350+
}
351+
remainder.append("“").append(emoji.getSecond()).append("”");
352+
if (!emoji.getFirst().equals(name)) {
353+
// case-insensitive collision, so note that
354+
remainder.append("(≈“" + emoji.getFirst() + "”) ");
338355
}
339356
}
340-
if (duplicateNameToEmoji == null) {
341-
duplicateNameToEmoji = TreeMultimap.create();
342-
}
343-
duplicateNameToEmoji.putAll(name, emojis);
357+
problems.add(prefix + remainder.toString());
344358
}
345359
}
346360
if (isVerbose() && duplicateNameToEmoji != null && !duplicateNameToEmoji.isEmpty()) {
361+
// TODO CLDR-16947: the following will print out in an interleaved way due to threading.
347362
System.out.println("\nCollisions");
348363
for (Entry<String, String> entry : duplicateNameToEmoji.entries()) {
349364
String emoji = entry.getValue();
350365
System.out.println(locale + "\t" + eng.getShortName(emoji) + "\t" + emoji);
351366
}
352367
}
368+
return problems.stream();
353369
}
354370

355371
public void testAnnotationPaths() {

0 commit comments

Comments
 (0)