diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/draft/ExemplarInfo.java b/tools/cldr-code/src/main/java/org/unicode/cldr/draft/ExemplarInfo.java index 7509336eea3..e1a948a058a 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/draft/ExemplarInfo.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/draft/ExemplarInfo.java @@ -20,20 +20,22 @@ import org.unicode.cldr.util.CLDRFile; import org.unicode.cldr.util.CLDRFile.WinningChoice; import org.unicode.cldr.util.CLDRPaths; +import org.unicode.cldr.util.ExemplarSets.ExemplarType; import org.unicode.cldr.util.Factory; import org.unicode.cldr.util.UnicodeSetPrettyPrinter; /** */ public class ExemplarInfo { - public static UnicodeSet IGNORE = + private static final UnicodeSet IGNORE = new UnicodeSet("[[:sc=unknown:][:script=common:]-[:M:]]").freeze(); - public static UnicodeSet TEST_ENCODING = new UnicodeSet("[[:any:]-[:c:] [:cc:]]").freeze(); + private static final UnicodeSet TEST_ENCODING = + new UnicodeSet("[[:any:]-[:c:] [:cc:]]").freeze(); - public static final Normalizer2 nfkd = Normalizer2.getInstance(null, "nfkc", Mode.DECOMPOSE); + private static final Normalizer2 nfkd = Normalizer2.getInstance(null, "nfkc", Mode.DECOMPOSE); - public static final Normalizer2 nfd = Normalizer2.getInstance(null, "nfc", Mode.DECOMPOSE); + private static final Normalizer2 nfd = Normalizer2.getInstance(null, "nfc", Mode.DECOMPOSE); - public static final Normalizer2 nfc = Normalizer2.getInstance(null, "nfc", Mode.COMPOSE); + private static final Normalizer2 nfc = Normalizer2.getInstance(null, "nfc", Mode.COMPOSE); private static final Normalizer2 nfkdMinus = new FilteredNormalizer2( @@ -45,8 +47,8 @@ public class ExemplarInfo { + "[:dt=Canonical:]]") .freeze()); - private static Map languageToExemplars = new TreeMap<>(); - private static UnicodeSet az = new UnicodeSet("[a-z]").freeze(); + private static final Map languageToExemplars = new TreeMap<>(); + private static final UnicodeSet az = new UnicodeSet("[a-z]").freeze(); static { @@ -97,7 +99,7 @@ public class ExemplarInfo { } // http://ja.wikipedia.org/wiki/学年別漢字配当表, http://kanji-a.seesaa.net/category/2203790-1.html - private static UnicodeMap JapaneseEducationLevels = + private static final UnicodeMap JapaneseEducationLevels = new UnicodeMap() .putAll( new UnicodeSet( @@ -128,68 +130,22 @@ public class ExemplarInfo { "[丈与且丘丙丹乏乙乾了互井亜享亭介仙仰企伏伐伯伴伸伺但佐佳併侍依侮侯侵促俊俗俸倒倣倫倹偉偏偵偶偽傍傑傘催債傾僕僚僧儀儒償充克免兼冒冗冠准凍凝凡凶凸凹刃刈刑到刺削剖剛剣剤剰劣励劾勅勘募勧勲勺匁匠匹匿升卑卓占即却卸厄厘又及双叔叙叫召吉吏吐吟含吹呈呉咲哀哲唆唇唐唯啓喚喝喪喫嗣嘆嘱噴嚇囚圏坊坑坪垣埋執培堀堅堕堤堪塀塁塊塑塔塗塚塾墜墨墳墾壁壇壊壌壮壱奇奉契奔奥奨奪奴如-妄妊妙妥妨姓姫姻威娘娠娯婆婚婿媒嫁嫌嫡嬢孔孤宜宰宴宵寂寛寝寡寧審寮寿封尉尋尚尼-尿屈履屯岐岬岳峠峡峰崇崎崩巡巧巨帆帝帥帽幅幣幻幽幾床庶庸廃廉廊廷弊弐弔弦弧弾彩彫彰影彼征徐御循微徴徹忌忍忙怒怖怠怪恋恐恒恥恨恭恵悔悟悠患悦悩悼惑惜惨惰愁愉愚慈慌慎慕慢慨慮慰慶憂憎憤憩憶憾懇懐懲懸戒戯戻房扇扉払扱扶抄把抑抗抜択披抱抵抹押抽拍拐拒拓拘拙拠括拷挑挟振挿捕捜据掃掌排掘掛控措掲描揚換握援揺搬搭携搾摂摘摩撃撤撮撲擁擦擬攻敏敢敷斉斎斗斜斤斥施旋既旨旬昆昇是普晶暁暇暦暫曇更曹替朕朱朴朽杉杯析枠枢枯架柄某柔柳栓核栽桃桑桟棄棋棚棟棺楼概槽欄欧欺款歓歳殉殊殖殴殻殿汁汗汚江沈沖没沢沸沼況泊泌泡泥泰洞津洪浄浜浦浪浮浸涙涯涼淑淡添渇渉渋渓渡渦湾湿溝溶滅滋滑滝滞滴漂漆漏漠漫漬漸潜潟潤澄濁濃濫濯瀬炉炊炎為烈焦煙煩煮燥爆爵牲犠狂狩狭猛猟猫献猶猿獄獣獲玄珍珠琴環璽瓶甘甚甲畔畜畝畳疎疫疲疾症痘痢痴療癒癖皆盆盗監盤盲盾眠眺睡督瞬矛矯砕砲硝硫硬碁碑磨礁礎祈祉祥禅禍秀租秩称稚稲稼稿穂穏穫突窃窒窮窯竜端符筒箇範篤簿籍粋粒粗粘粛粧糧糾紋紛紡索紫累紳紹紺絞絡継維綱網緊緒締緩緯縁縄縛縫繁繊繕繭繰缶罰罷羅翁翻翼耐耗聴肌肖肝肢肩肪肯胆胎胞胴脂脅脚脱脹腐腕腰膚膜膨臭致舗舞舟般舶艇艦芋芝芳苗茂茎荒荘菊菌菓華葬蓄薄薦薪薫藩藻虐虚虜虞蚊蛇蛍蛮融衝衡衰衷袋被裂裕裸褐褒襟襲覆覇触訂託訟訴診詐詔詠詰該詳誇誉誓誘請諭諮諾謀謁謄謙謡謹譜譲豚豪貞貢販貫賄賊賓賜賠賢賦購贈赦赴超越趣距跡跳践踊踏躍軌軒軟軸較載輝輩轄辛辱込迅迎迫迭逃透逐逓途逝逮逸遂遅遇遍違遣遭遮遵遷避還邦邪邸郊郎郭酌酔酢酪酬酵酷醜醸釈釣鈍鈴鉛鉢銃銑銘鋭鋳錘錠錬錯鍛鎖鎮鐘鑑閑閥閲闘阻附陣陥陪陰陳陵陶隅隆随隔隠隣隷隻雄雅雇雌離雰零雷需震霊霜霧露靴韻響項頑頒頻頼顕顧飢飽飾餓香駄駆駐騎騒騰驚髄髪鬼魂魅魔鮮鯨鶏麗麻黙鼓齢]"), "9") .freeze(); - // static { - // for (Integer value : Builder.with(new - // TreeSet()).addAll(JapaneseEducationLevels.values()).get()) { - // System.out.println(".putAll(new UnicodeSet(\"" + - // JapaneseEducationLevels.getSet(value).toPattern(false) + "\"), " - // + value + ")"); - // } - // } - private UnicodeSet exemplars; - UnicodeSet exemplarsX; - UnicodeSet auxiliariesX; - UnicodeSet exemplarScripts; - UnicodeSet auxiliaryScripts; - UnicodeMap educationLevels = new UnicodeMap<>(); + private final UnicodeMap educationLevels = new UnicodeMap<>(); - static Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*"); + private static final Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*"); private ExemplarInfo(String main, String aux) { this(new UnicodeSet(main), new UnicodeSet(aux), null); } - public enum Status { - O, - M, - A, - S, - T, - X, - N - } - - public Status getStatus(String sequence) { - if (IGNORE.containsAll(sequence)) { - return Status.O; - } - if (exemplarsX.containsAll(sequence)) { - return Status.M; - } - if (auxiliariesX.containsAll(sequence)) { - return Status.A; - } - if (exemplarScripts.containsAll(sequence)) { - return Status.S; - } - if (auxiliaryScripts.containsAll(sequence)) { - return Status.T; - } - return Status.X; - } - - public UnicodeSet getExemplars() { - return exemplars; - } - private ExemplarInfo(UnicodeSet exemplars1, UnicodeSet auxiliary1, ULocale locale) { // check that the aux exemplars include all or none of a-z if (auxiliary1 == null) { auxiliary1 = new UnicodeSet(); } - exemplars = ExemplarInfo.flatten(exemplars1, locale).freeze(); + UnicodeSet exemplars = ExemplarInfo.flatten(exemplars1, locale).freeze(); auxiliary1.addAll(exemplars1); if (auxiliary1.containsSome(az) && !auxiliary1.containsAll(az)) { System.err.println( @@ -199,10 +155,12 @@ private ExemplarInfo(UnicodeSet exemplars1, UnicodeSet auxiliary1, ULocale local + UnicodeSetPrettyPrinter.ROOT_ICU.format(auxiliary1)); } auxiliary1.addAll(az); - auxiliariesX = ExemplarInfo.flatten(auxiliary1, locale).addAll(IGNORE).freeze(); - exemplarsX = new UnicodeSet(exemplars).addAll(IGNORE).freeze(); - exemplarScripts = expandScripts(exemplars1, locale).addAll(IGNORE).freeze(); - auxiliaryScripts = expandScripts(auxiliary1, locale).addAll(IGNORE).freeze(); + // Note: after automatic refactoring to remove dead code, the following + // four lines remain, but it is doubtful whether they serve any purpose. + ExemplarInfo.flatten(auxiliary1, locale).addAll(IGNORE).freeze(); + new UnicodeSet(exemplars).addAll(IGNORE).freeze(); + expandScripts(exemplars1, locale).addAll(IGNORE).freeze(); + expandScripts(auxiliary1, locale).addAll(IGNORE).freeze(); if (locale != null) { if (locale.equals(ULocale.JAPANESE)) { educationLevels.putAll(getCharset("Shift_JIS"), "SJIS"); @@ -256,18 +214,11 @@ private UnicodeSet expandScripts(UnicodeSet source, ULocale locale) { return ExemplarInfo.flatten(scripts, locale); } - public static String getCldrLanguage(String language) { + private static String getCldrLanguage(String language) { return LanguageCodeConverter.fromGoogleLocaleId(language); - // String cldrLanguage = language.replace("-", "_"); - // if (cldrLanguage.equals("tl")) { - // cldrLanguage = "fil"; - // } else if (cldrLanguage.equals("no")) { - // cldrLanguage = "nb"; - // } - // return cldrLanguage; } - public static String specialNormalize(String marks, ULocale locale) { + private static String specialNormalize(String marks, ULocale locale) { marks = ExemplarInfo.nfd.normalize(marks); marks = locale == null @@ -278,7 +229,7 @@ public static String specialNormalize(String marks, ULocale locale) { return marks; } - public static UnicodeSet flatten(UnicodeSet exemplar1, ULocale locale) { + private static UnicodeSet flatten(UnicodeSet exemplar1, ULocale locale) { if (exemplar1 == null) { return null; } @@ -300,24 +251,27 @@ public static UnicodeSet flatten(UnicodeSet exemplar1, ULocale locale) { for (int i = 0; i < s.length(); i += Character.charCount(cp)) { cp = s.codePointAt(i); int type = UCharacter.getType(cp); - if (type == UCharacter.ENCLOSING_MARK - || type == UCharacter.NON_SPACING_MARK - || type == UCharacter.COMBINING_SPACING_MARK) { - // continue; - } else { - // add up to now, and reset pointer - if (i > lastPos) { - result.add(s.substring(lastPos, i)); - } - lastPos = i; + switch (type) { + case UCharacter.ENCLOSING_MARK: + case UCharacter.NON_SPACING_MARK: + case UCharacter.COMBINING_SPACING_MARK: + // continue; + break; + default: + // add up to now, and reset pointer + if (i > lastPos) { + result.add(s.substring(lastPos, i)); + } + lastPos = i; + break; } } - result.add(s.substring(lastPos, s.length())); + result.add(s.substring(lastPos)); } return result; } - public static ExemplarInfo make(String language, Set missingExemplars) { + private static ExemplarInfo make(String language, Set missingExemplars) { String cldrLanguage = ExemplarInfo.getCldrLanguage(language); ExemplarInfo exemplarInfo = languageToExemplars.get(cldrLanguage); if (exemplarInfo == null) { @@ -329,8 +283,8 @@ public static ExemplarInfo make(String language, Set missingExemplars) { System.out.print(""); } CLDRFile file = ExemplarInfo.cldrFactory.make(cldrLanguage, true); - exemplars1 = file.getExemplarSet("", WinningChoice.WINNING, 0); - auxiliary1 = file.getExemplarSet("auxiliary", WinningChoice.WINNING, 0); + exemplars1 = file.getExemplarSet(ExemplarType.main, WinningChoice.WINNING); + auxiliary1 = file.getExemplarSet(ExemplarType.auxiliary, WinningChoice.WINNING); } catch (Exception e) { System.out.println( "Can't read " @@ -350,7 +304,7 @@ public static ExemplarInfo make(String language, Set missingExemplars) { return exemplarInfo; } - public String getEducationLevel(CharSequence input) { + private String getEducationLevel(CharSequence input) { String result = null; for (CodePoints cps = new CodePoints(input); cps.next(); ) { String level = educationLevels.get(cps.getCodePoint()); diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/draft/Misc.java b/tools/cldr-code/src/main/java/org/unicode/cldr/draft/Misc.java index 628e4187e8d..a0189bc176d 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/draft/Misc.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/draft/Misc.java @@ -37,6 +37,7 @@ import org.unicode.cldr.util.CLDRFile.WinningChoice; import org.unicode.cldr.util.CLDRPaths; import org.unicode.cldr.util.CollatorHelper; +import org.unicode.cldr.util.ExemplarSets.ExemplarType; import org.unicode.cldr.util.Factory; import org.unicode.cldr.util.LanguageTagParser; import org.unicode.cldr.util.LocaleIDParser; @@ -335,7 +336,7 @@ private static void showExemplarSize() { } String englishName = english.nameGetter().getNameFromIdentifier(baseLanguage); CLDRFile cldrFile = factory.make(baseLanguage, false); - UnicodeSet set = cldrFile.getExemplarSet("", WinningChoice.WINNING); + UnicodeSet set = cldrFile.getExemplarSet(ExemplarType.main, WinningChoice.WINNING); int script = -1; for (String s : set) { int cp = s.codePointAt(0); diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/test/CasingInfo.java b/tools/cldr-code/src/main/java/org/unicode/cldr/test/CasingInfo.java index c74ac700d6f..d8834503e84 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/test/CasingInfo.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/test/CasingInfo.java @@ -22,6 +22,7 @@ import org.unicode.cldr.util.CLDRFile.WinningChoice; import org.unicode.cldr.util.CLDRPaths; import org.unicode.cldr.util.CldrUtility; +import org.unicode.cldr.util.ExemplarSets.ExemplarType; import org.unicode.cldr.util.Factory; import org.unicode.cldr.util.LocaleIDParser; import org.unicode.cldr.util.PatternCache; @@ -141,7 +142,7 @@ private Map generateCasingInformation(String localePattern) { // Save casing information about the locale. CLDRFile file = cldrFactory.make(localeID, true); - UnicodeSet examplars = file.getExemplarSet("", WinningChoice.NORMAL); + UnicodeSet examplars = file.getExemplarSet(ExemplarType.main, WinningChoice.NORMAL); localeUsesCasing.put(localeID, examplars.containsSome(allCaps)); createCasingXml(localeID, CheckConsistentCasing.getSamples(file)); } diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/test/CheckExemplars.java b/tools/cldr-code/src/main/java/org/unicode/cldr/test/CheckExemplars.java index c5a72a6869e..ae7bea2d1ae 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/test/CheckExemplars.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/test/CheckExemplars.java @@ -11,136 +11,25 @@ import java.util.BitSet; import java.util.Comparator; import java.util.List; -import java.util.Locale; import org.unicode.cldr.test.CheckCLDR.CheckStatus.Subtype; import org.unicode.cldr.util.CLDRFile; import org.unicode.cldr.util.ComparatorUtilities; +import org.unicode.cldr.util.ExemplarSets; +import org.unicode.cldr.util.ExemplarSets.ExemplarType; import org.unicode.cldr.util.Factory; import org.unicode.cldr.util.SimpleUnicodeSetFormatter; import org.unicode.cldr.util.UnicodeSetPrettyPrinter; import org.unicode.cldr.util.XPathParts; public class CheckExemplars extends FactoryCheckCLDR { - - public static final boolean USE_PUNCTUATION = false; private static final boolean SUPPRESS_AUX_EMPTY_CHECK = true; private static final String[] QUOTE_ELEMENTS = { "quotationStart", "quotationEnd", "alternateQuotationStart", "alternateQuotationEnd" }; - Collator col; - boolean isRoot; - SimpleUnicodeSetFormatter displayFormatter; - UnicodeSetPrettyPrinter rawFormatter; - - static final UnicodeSet HangulSyllables = - new UnicodeSet("[[:Hangul_Syllable_Type=LVT:][:Hangul_Syllable_Type=LV:]]").freeze(); - - public static final UnicodeSet AlwaysOK; - - static { - if (USE_PUNCTUATION) { - AlwaysOK = new UnicodeSet("[\\u0020\\u00A0]"); - } else { - AlwaysOK = - new UnicodeSet( - "[[[:Nd:][:script=common:][:script=inherited:]-[:Default_Ignorable_Code_Point:]-[:C:] - [_]] [\u05BE \u05F3 \u066A-\u066C]" - + "[[؉][་ །༌][ཱ]‎‎{য়}য়]" - + // TODO Fix this Hack - "-[❮❯]]"); // [\\u200c-\\u200f] - // [:script=common:][:script=inherited:] - } - AlwaysOK.freeze(); - } - - // TODO Fix some of these characters - private static final UnicodeSet SPECIAL_ALLOW = - new UnicodeSet( - "[\u061C\\u200E\\u200F\\u200c\\u200d" - + "‎‎‎[\u064B\u064E-\u0651\u0670]‎[:Nd:]‎[\u0951\u0952]‎[\u064B-\u0652\u0654-\u0657\u0670]‎[\u0A66-\u0A6F][\u0ED0-\u0ED9][\u064B-\u0652]‎[\\u02BB\\u02BC][\u0CE6-\u0CEF]‎‎[\u0966-\u096F]" - + "‎‎‎[:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] ]" // restore - // [:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] - ) - .freeze(); // add RLM, LRM [\u200C\u200D]‎ - - public static final UnicodeSet UAllowedInExemplars = - new UnicodeSet( - "[[:assigned:]-[:Z:]]") // [:alphabetic:][:Mn:][:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] - .removeAll(AlwaysOK) // this will remove some - // [:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] so we - // restore them - // in SPECIAL_ALLOW - .addAll(SPECIAL_ALLOW) // add RLM, LRM [\u200C\u200D]‎ - .freeze(); - - public static final UnicodeSet UAllowedInNumbers = - new UnicodeSet( - "[\u00A0\u202F[:N:][:P:][:Sm:][:Letter_Number:][:Numeric_Type=Numeric:]]") // [:alphabetic:][:Mn:][:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] - .addAll(SPECIAL_ALLOW) // add RLM, LRM [\u200C\u200D]‎ - .freeze(); - - public static final UnicodeSet AllowedInExemplars = - new UnicodeSet(UAllowedInExemplars) - .removeAll(new UnicodeSet("[[:Uppercase:]-[\u0130]]")) - .freeze(); - - private static final UnicodeSet ALLOWED_IN_NUMBERS_NOT_IN_MAIN = - new UnicodeSet("[[:Numeric_Type=Decimal:]]").freeze(); - - private static final UnicodeSet ALLOWED_IN_MAIN = - new UnicodeSet(AllowedInExemplars).removeAll(ALLOWED_IN_NUMBERS_NOT_IN_MAIN).freeze(); - - public static final UnicodeSet ALLOWED_IN_PUNCTUATION = - new UnicodeSet("[[:P:][:S:]-[:Sc:]]").freeze(); - - public static final UnicodeSet ALLOWED_IN_AUX = - new UnicodeSet(AllowedInExemplars) - .addAll(ALLOWED_IN_PUNCTUATION) - .removeAll(AlwaysOK) // this will remove some - // [:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] so we - // restore them - // in SPECIAL_ALLOW - .addAll(SPECIAL_ALLOW) // add RLM, LRM [\u200C\u200D]‎ - .freeze(); - - public enum ExemplarType { - main( - ALLOWED_IN_MAIN, - "(specific-script - uppercase - invisibles - numbers + \u0130)", - true), - auxiliary(ALLOWED_IN_AUX, "(specific-script - uppercase - invisibles + \u0130)", true), - punctuation(ALLOWED_IN_PUNCTUATION, "punctuation", false), - punctuation_auxiliary(ALLOWED_IN_PUNCTUATION, "punctuation-auxiliary", false), - punctuation_person(ALLOWED_IN_PUNCTUATION, "punctuation-person", false), - numbers(UAllowedInNumbers, "(specific-script - invisibles)", false), - numbers_auxiliary(UAllowedInNumbers, "(specific-script - invisibles)", false), - index(UAllowedInExemplars, "(specific-script - invisibles)", false), - // currencySymbol(AllowedInExemplars, "(specific-script - uppercase - invisibles + \u0130)", - // false) - ; - - public final UnicodeSet allowed; - public final UnicodeSet toRemove; - public final String message; - public final boolean convertUppercase; - - ExemplarType(UnicodeSet allowed, String message, boolean convertUppercase) { - if (!allowed.isFrozen()) { - throw new IllegalArgumentException("Internal Error"); - } - this.allowed = allowed; - this.message = message; - this.toRemove = new UnicodeSet(allowed).complement().freeze(); - this.convertUppercase = convertUppercase; - } - - public static ExemplarType from(String name) { - return name == null - ? ExemplarType.main - : ExemplarType.valueOf(name.replace('-', '_').toLowerCase(Locale.ROOT)); - } - } + private boolean isRoot; + private UnicodeSetPrettyPrinter rawFormatter; public CheckExemplars(Factory factory) { super(factory); @@ -155,17 +44,16 @@ public CheckCLDR handleSetCldrFileToCheck( super.handleSetCldrFileToCheck(cldrFileToCheck, options, possibleErrors); String locale = cldrFileToCheck.getLocaleID(); isRoot = cldrFileToCheck.getLocaleID().equals("root"); - col = ComparatorUtilities.getIcuCollator(new ULocale(locale), Collator.IDENTICAL); + Collator col = ComparatorUtilities.getIcuCollator(new ULocale(locale), Collator.IDENTICAL); Collator spaceCol = ComparatorUtilities.getIcuCollator(new ULocale(locale), Collator.PRIMARY); - displayFormatter = new SimpleUnicodeSetFormatter((Comparator) col); rawFormatter = UnicodeSetPrettyPrinter.from((Comparator) col, (Comparator) spaceCol); // check for auxiliary anyway if (!SUPPRESS_AUX_EMPTY_CHECK) { UnicodeSet auxiliarySet = getResolvedCldrFileToCheck() - .getExemplarSet("auxiliary", CLDRFile.WinningChoice.WINNING); + .getExemplarSet(ExemplarType.auxiliary, CLDRFile.WinningChoice.WINNING); if (auxiliarySet == null) { possibleErrors.add( @@ -183,14 +71,18 @@ public CheckCLDR handleSetCldrFileToCheck( @Override public CheckCLDR handleCheck( String path, String fullPath, String value, Options options, List result) { - if (fullPath == null) return this; // skip paths that we don't have + if (fullPath == null) { + return this; // skip paths that we don't have + } if (!path.contains("/exemplarCharacters")) { if (path.contains("parseLenient")) { checkParse(path, value, result); } return this; } - if (!accept(result)) return this; + if (!accept(result)) { + return this; + } XPathParts oparts = XPathParts.getFrozenInstance(path); final String exemplarString = oparts.findAttributeValue("exemplarCharacters", "type"); ExemplarType type = ExemplarType.from(exemplarString); @@ -199,87 +91,27 @@ public CheckCLDR handleCheck( // check relation to auxiliary set try { UnicodeSet mainSet = - getResolvedCldrFileToCheck().getExemplarSet("", CLDRFile.WinningChoice.WINNING); - if (type == ExemplarType.auxiliary) { - UnicodeSet auxiliarySet = SimpleUnicodeSetFormatter.parseLenient(value); - - UnicodeSet combined = new UnicodeSet(mainSet).addAll(auxiliarySet); - checkMixedScripts("main+auxiliary", combined, result); - - if (auxiliarySet.containsSome(mainSet)) { - UnicodeSet overlap = - new UnicodeSet(mainSet) - .retainAll(auxiliarySet) - .removeAll(HangulSyllables); - if (!overlap.isEmpty()) { - String fixedExemplar1 = rawFormatter.format(overlap); - result.add( - new CheckStatus() - .setCause(this) - .setMainType(CheckStatus.errorType) - .setSubtype(Subtype.auxiliaryExemplarsOverlap) - .setMessage( - "Auxiliary characters also exist in main: \u200E{0}\u200E", - new Object[] {fixedExemplar1})); - } - } - } else if (type == ExemplarType.punctuation) { - // Check that the punctuation exemplar characters include quotation marks. - UnicodeSet punctuationSet = SimpleUnicodeSetFormatter.parseLenient(value); - UnicodeSet quoteSet = new UnicodeSet(); - for (String element : QUOTE_ELEMENTS) { - quoteSet.add( - getResolvedCldrFileToCheck() - .getWinningValue("//ldml/delimiters/" + element)); - } - if (!punctuationSet.containsAll(quoteSet)) { - quoteSet.removeAll(punctuationSet); - // go ahead and list the characters separately, with space between, for clarity. - StringBuilder characters = new StringBuilder(); - for (String item : quoteSet) { - if (characters.length() != 0) { - characters.append(" "); - } - characters.append(item); - } - // String characters = quoteSet.toPattern(false); - CheckStatus message = - new CheckStatus() - .setCause(this) - .setMainType(CheckStatus.warningType) - .setSubtype(Subtype.missingPunctuationCharacters) - .setMessage( - "Punctuation exemplar characters are missing quotation marks for this locale: {0}", - characters); - result.add(message); - } - } else if (type == ExemplarType.index) { - // Check that the index exemplar characters are in case-completed union of main and - // auxiliary exemplars - UnicodeSet auxiliarySet = - getResolvedCldrFileToCheck() - .getExemplarSet("auxiliary", CLDRFile.WinningChoice.WINNING); - if (auxiliarySet == null) { - auxiliarySet = new UnicodeSet(); - } - UnicodeSet mainAndAuxAllCase = - new UnicodeSet(mainSet) - .addAll(auxiliarySet) - .closeOver(UnicodeSet.ADD_CASE_MAPPINGS); - UnicodeSet indexBadChars = - SimpleUnicodeSetFormatter.parseLenient(value).removeAll(mainAndAuxAllCase); - - if (!indexBadChars.isEmpty()) { - CheckStatus message = - new CheckStatus() - .setCause(this) - .setMainType(CheckStatus.warningType) - .setSubtype(Subtype.charactersNotInMainOrAuxiliaryExemplars) - .setMessage( - "Index exemplars include characters not in main or auxiliary exemplars: {0}", - indexBadChars.toPattern(false)); - result.add(message); - } + getResolvedCldrFileToCheck() + .getExemplarSet(ExemplarType.main, CLDRFile.WinningChoice.WINNING); + UnicodeSet currentSet = SimpleUnicodeSetFormatter.parseLenient(value); + switch (type) { + case auxiliary: + checkAuxiliary(currentSet, mainSet, result); + break; + case punctuation: + checkPunctuation(currentSet, result); + break; + case index: + checkIndex(currentSet, mainSet, result); + break; + case main: + case punctuation_auxiliary: + case punctuation_person: + case numbers: + case numbers_auxiliary: + break; + default: + throw new IllegalArgumentException("Case not handled: " + type); } // check for consistency with RTL @@ -311,11 +143,92 @@ public CheckCLDR handleCheck( } } - } catch (Exception e) { + } catch (Exception ignored) { } // if these didn't parse, checkExemplar will be called anyway at some point return this; } + private void checkAuxiliary( + UnicodeSet currentSet, UnicodeSet mainSet, List result) { + UnicodeSet combined = new UnicodeSet(mainSet).addAll(currentSet); + checkMixedScripts("main+auxiliary", combined, result); + if (currentSet.containsSome(mainSet)) { + UnicodeSet overlap = + new UnicodeSet(mainSet) + .retainAll(currentSet) + .removeAll(ExemplarSets.HangulSyllables); + if (!overlap.isEmpty()) { + String fixedExemplar1 = rawFormatter.format(overlap); + result.add( + new CheckStatus() + .setCause(this) + .setMainType(CheckStatus.errorType) + .setSubtype(Subtype.auxiliaryExemplarsOverlap) + .setMessage( + "Auxiliary characters also exist in main: \u200E{0}\u200E", + fixedExemplar1)); + } + } + } + + private void checkPunctuation(UnicodeSet currentSet, List result) { + // Check that the punctuation exemplar characters include quotation marks. + UnicodeSet quoteSet = new UnicodeSet(); + for (String element : QUOTE_ELEMENTS) { + quoteSet.add( + getResolvedCldrFileToCheck().getWinningValue("//ldml/delimiters/" + element)); + } + if (!currentSet.containsAll(quoteSet)) { + quoteSet.removeAll(currentSet); + // go ahead and list the characters separately, with space between, for + // clarity. + StringBuilder characters = new StringBuilder(); + for (String item : quoteSet) { + if (characters.length() != 0) { + characters.append(" "); + } + characters.append(item); + } + CheckStatus message = + new CheckStatus() + .setCause(this) + .setMainType(CheckStatus.warningType) + .setSubtype(Subtype.missingPunctuationCharacters) + .setMessage( + "Punctuation exemplar characters are missing quotation marks for this locale: {0}", + characters); + result.add(message); + } + } + + private void checkIndex(UnicodeSet currentSet, UnicodeSet mainSet, List result) { + // Check that the index exemplar characters are in case-completed union of main + // and auxiliary exemplars + UnicodeSet auxiliarySet2 = + getResolvedCldrFileToCheck() + .getExemplarSet(ExemplarType.auxiliary, CLDRFile.WinningChoice.WINNING); + if (auxiliarySet2 == null) { + auxiliarySet2 = new UnicodeSet(); + } + UnicodeSet mainAndAuxAllCase = + new UnicodeSet(mainSet) + .addAll(auxiliarySet2) + .closeOver(UnicodeSet.ADD_CASE_MAPPINGS); + UnicodeSet indexBadChars = currentSet.removeAll(mainAndAuxAllCase); + + if (!indexBadChars.isEmpty()) { + CheckStatus message = + new CheckStatus() + .setCause(this) + .setMainType(CheckStatus.warningType) + .setSubtype(Subtype.charactersNotInMainOrAuxiliaryExemplars) + .setMessage( + "Index exemplars include characters not in main or auxiliary exemplars: {0}", + indexBadChars.toPattern(false)); + result.add(message); + } + } + private void checkParse(String path, String value, List result) { if (value == null) { CheckStatus message = @@ -361,17 +274,6 @@ private void checkParse(String path, String value, List result) { } } - static final BitSet Japn = new BitSet(); - static final BitSet Kore = new BitSet(); - - static { - Japn.set(UScript.HAN); - Japn.set(UScript.HIRAGANA); - Japn.set(UScript.KATAKANA); - Kore.set(UScript.HAN); - Kore.set(UScript.HANGUL); - } - private void checkMixedScripts(String title, UnicodeSet set, List result) { BitSet s = new BitSet(); for (String item : set) { @@ -388,7 +290,7 @@ private void checkMixedScripts(String title, UnicodeSet set, List r return; // allow 2 scripts in exemplars for currencies. } // allowable combinations - if (s.equals(Japn) || s.equals(Kore)) { + if (s.equals(ExemplarSets.Jpan) || s.equals(ExemplarSets.Kore)) { return; } StringBuilder scripts = new StringBuilder(); @@ -422,7 +324,8 @@ private void checkMixedScripts(String title, UnicodeSet set, List r .setMessage("{0} exemplars contain multiple scripts: {1}", title, scripts)); } - private void checkExemplar(String v, List result, ExemplarType exemplarType) { + private void checkExemplar( + String v, List result, ExemplarSets.ExemplarType exemplarType) { if (v == null) return; final UnicodeSet exemplar1; try { @@ -452,8 +355,7 @@ private void checkExemplar(String v, List result, ExemplarType exem .setMainType(CheckStatus.errorType) .setSubtype(Subtype.internalUnicodeSetFormattingError) .setMessage( - "Internal Error: formatting not working for {0}", - new Object[] {exemplar1})); + "Internal Error: formatting not working for {0}", exemplar1)); } // else if (!v.equals(fixedExemplar1)) { // result.add(new CheckStatus().setCause(this).setType(CheckStatus.warningType) @@ -517,7 +419,9 @@ private void checkExemplar(String v, List result, ExemplarType exem "Exemplar set (" + exemplarType + ") must not be empty -- that would imply that this language uses no " - + (exemplarType == ExemplarType.punctuation + + (exemplarType + == ExemplarSets.ExemplarType + .punctuation ? "punctuation" : "letters") + "!")); diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/test/CheckForExemplars.java b/tools/cldr-code/src/main/java/org/unicode/cldr/test/CheckForExemplars.java index c77215a3ece..53f075ad9cd 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/test/CheckForExemplars.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/test/CheckForExemplars.java @@ -37,6 +37,8 @@ import org.unicode.cldr.util.CLDRFile; import org.unicode.cldr.util.CLDRFile.Status; import org.unicode.cldr.util.DateConstants; +import org.unicode.cldr.util.ExemplarSets; +import org.unicode.cldr.util.ExemplarSets.ExemplarType; import org.unicode.cldr.util.Factory; import org.unicode.cldr.util.InternalCldrException; import org.unicode.cldr.util.LocaleIDParser; @@ -224,7 +226,7 @@ public CheckCLDR handleSetCldrFileToCheck( CLDRFile resolvedFile = getResolvedCldrFileToCheck(); boolean[] ok = new boolean[1]; - exemplars = safeGetExemplars("", possibleErrors, resolvedFile, ok); + exemplars = safeGetExemplars(ExemplarType.main, possibleErrors, resolvedFile, ok); if (exemplars == null) { CheckStatus item = @@ -251,7 +253,7 @@ public CheckCLDR handleSetCldrFileToCheck( // if (temp != null) exemplars.addAll(temp); UnicodeSet auxiliary = safeGetExemplars( - "auxiliary", + ExemplarType.auxiliary, possibleErrors, resolvedFile, ok); // resolvedFile.getExemplarSet("auxiliary", @@ -260,25 +262,7 @@ public CheckCLDR handleSetCldrFileToCheck( exemplars.addAll(auxiliary); } - if (CheckExemplars.USE_PUNCTUATION) { - UnicodeSet punctuation = - safeGetExemplars( - "punctuation", - possibleErrors, - resolvedFile, - ok); // resolvedFile.getExemplarSet("auxiliary", - if (punctuation != null) { - exemplars.addAll(punctuation); - } - - UnicodeSet numbers = getNumberSystemExemplars(); - exemplars.addAll(numbers); - - // TODO fix replacement character - exemplars.add(STAND_IN); - } - - exemplars.addAll(CheckExemplars.AlwaysOK).addAll(LB_JOIN_CONTROLS).freeze(); + exemplars.addAll(ExemplarSets.AlwaysOK).addAll(LB_JOIN_CONTROLS).freeze(); exemplarsPlusAscii = new UnicodeSet(exemplars).addAll(ASCII).freeze(); skip = false; @@ -294,7 +278,10 @@ private UnicodeSet getNumberSystemExemplars() { } private UnicodeSet safeGetExemplars( - String type, List possibleErrors, CLDRFile resolvedFile, boolean[] ok) { + ExemplarType type, + List possibleErrors, + CLDRFile resolvedFile, + boolean[] ok) { UnicodeSet result = null; try { result = resolvedFile.getExemplarSet(type, CLDRFile.WinningChoice.WINNING); diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/test/DisplayAndInputProcessor.java b/tools/cldr-code/src/main/java/org/unicode/cldr/test/DisplayAndInputProcessor.java index 98480d28c9f..0baf5d7f9bc 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/test/DisplayAndInputProcessor.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/test/DisplayAndInputProcessor.java @@ -31,7 +31,6 @@ import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.unicode.cldr.test.CheckExemplars.ExemplarType; import org.unicode.cldr.util.AnnotationUtil; import org.unicode.cldr.util.Builder; import org.unicode.cldr.util.CLDRConfig; @@ -42,6 +41,7 @@ import org.unicode.cldr.util.DateTimeCanonicalizer; import org.unicode.cldr.util.DateTimeCanonicalizer.DateTimePatternType; import org.unicode.cldr.util.Emoji; +import org.unicode.cldr.util.ExemplarSets.ExemplarType; import org.unicode.cldr.util.LocaleNames; import org.unicode.cldr.util.PatternCache; import org.unicode.cldr.util.SimpleUnicodeSetFormatter; diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/test/ExampleGenerator.java b/tools/cldr-code/src/main/java/org/unicode/cldr/test/ExampleGenerator.java index 42408da5c64..ee83a6ba0ee 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/test/ExampleGenerator.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/test/ExampleGenerator.java @@ -54,7 +54,6 @@ import org.unicode.cldr.util.AnnotationUtil; import org.unicode.cldr.util.CLDRConfig; import org.unicode.cldr.util.CLDRFile; -import org.unicode.cldr.util.CLDRFile.ExemplarType; import org.unicode.cldr.util.CLDRFile.WinningChoice; import org.unicode.cldr.util.CLDRFileOverride; import org.unicode.cldr.util.CLDRLocale; @@ -64,6 +63,7 @@ import org.unicode.cldr.util.DayPeriodInfo; import org.unicode.cldr.util.DayPeriodInfo.DayPeriod; import org.unicode.cldr.util.EmojiConstants; +import org.unicode.cldr.util.ExemplarSets.ExemplarType; import org.unicode.cldr.util.GrammarInfo; import org.unicode.cldr.util.GrammarInfo.GrammaticalFeature; import org.unicode.cldr.util.GrammarInfo.GrammaticalScope; diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CLDRModify.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CLDRModify.java index cd66f3da4cc..4316b124815 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CLDRModify.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CLDRModify.java @@ -48,7 +48,6 @@ import org.unicode.cldr.util.CLDRConfig; import org.unicode.cldr.util.CLDRFile; import org.unicode.cldr.util.CLDRFile.DraftStatus; -import org.unicode.cldr.util.CLDRFile.ExemplarType; import org.unicode.cldr.util.CLDRFile.NumberingSystem; import org.unicode.cldr.util.CLDRFile.WinningChoice; import org.unicode.cldr.util.CLDRLocale; @@ -61,6 +60,7 @@ import org.unicode.cldr.util.DowngradePaths; import org.unicode.cldr.util.DtdData; import org.unicode.cldr.util.DtdType; +import org.unicode.cldr.util.ExemplarSets.ExemplarType; import org.unicode.cldr.util.Factory; import org.unicode.cldr.util.FileProcessor; import org.unicode.cldr.util.GlossonymConstructor; diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ChartCollation.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ChartCollation.java index fba82cc34ed..7a74467a8e2 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ChartCollation.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ChartCollation.java @@ -23,10 +23,10 @@ import org.unicode.cldr.util.CLDRConfig; import org.unicode.cldr.util.CLDRFile; import org.unicode.cldr.util.CLDRFile.DraftStatus; -import org.unicode.cldr.util.CLDRFile.ExemplarType; import org.unicode.cldr.util.CLDRFile.NumberingSystem; import org.unicode.cldr.util.CLDRFile.WinningChoice; import org.unicode.cldr.util.CLDRPaths; +import org.unicode.cldr.util.ExemplarSets.ExemplarType; import org.unicode.cldr.util.Factory; import org.unicode.cldr.util.FileCopier; import org.unicode.cldr.util.NameGetter; diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CompareExemplars.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CompareExemplars.java index cffe45206c9..60593adc899 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CompareExemplars.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CompareExemplars.java @@ -14,8 +14,8 @@ import java.util.TreeMap; import org.unicode.cldr.util.CLDRConfig; import org.unicode.cldr.util.CLDRFile; -import org.unicode.cldr.util.CLDRFile.ExemplarType; import org.unicode.cldr.util.CLDRFile.WinningChoice; +import org.unicode.cldr.util.ExemplarSets.ExemplarType; import org.unicode.cldr.util.Factory; import org.unicode.cldr.util.UnicodeRelation; diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/DeriveScripts.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/DeriveScripts.java index 01481996e23..7fe85b2ae56 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/DeriveScripts.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/DeriveScripts.java @@ -17,6 +17,7 @@ import org.unicode.cldr.util.CLDRFile; import org.unicode.cldr.util.CLDRFile.WinningChoice; import org.unicode.cldr.util.CLDRPaths; +import org.unicode.cldr.util.ExemplarSets.ExemplarType; import org.unicode.cldr.util.Factory; import org.unicode.cldr.util.Iso639Data; import org.unicode.cldr.util.LanguageTagCanonicalizer; @@ -114,7 +115,8 @@ public class DeriveScripts { + ")", nsde); } - UnicodeSet exemplars = cldrFile.getExemplarSet("", WinningChoice.WINNING); + UnicodeSet exemplars = + cldrFile.getExemplarSet(ExemplarType.main, WinningChoice.WINNING); for (String s : exemplars) { int scriptNum = UScript.getScript(s.codePointAt(0)); if (scriptNum != UScript.COMMON diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ExtractMessages.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ExtractMessages.java index 752a96f9de0..4233356260c 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ExtractMessages.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ExtractMessages.java @@ -21,6 +21,7 @@ import org.unicode.cldr.util.CLDRPaths; import org.unicode.cldr.util.CldrUtility; import org.unicode.cldr.util.CollatorHelper; +import org.unicode.cldr.util.ExemplarSets.ExemplarType; import org.unicode.cldr.util.Factory; import org.unicode.cldr.util.NameGetter; import org.unicode.cldr.util.NameType; @@ -274,7 +275,8 @@ public void setLocale(String locale) { throw new RuntimeException("Skipping " + locale); } cldrFile = cldrFactory.make(locale, false); - UnicodeSet exemplars = cldrFile.getExemplarSet("", WinningChoice.WINNING); + UnicodeSet exemplars = + cldrFile.getExemplarSet(ExemplarType.main, WinningChoice.WINNING); usesLatin = exemplars != null && exemplars.containsSome(LATIN_SCRIPT); for (DataHandler dataHandler : dataHandlers) { dataHandler.reset(cldrFile); diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateCasingChart.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateCasingChart.java index c1648b609df..33693725a95 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateCasingChart.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateCasingChart.java @@ -21,6 +21,7 @@ import org.unicode.cldr.util.CLDRPaths; import org.unicode.cldr.util.ChainedMap; import org.unicode.cldr.util.ChainedMap.M3; +import org.unicode.cldr.util.ExemplarSets.ExemplarType; import org.unicode.cldr.util.Factory; import org.unicode.cldr.util.Level; import org.unicode.cldr.util.StandardCodes; @@ -103,7 +104,8 @@ public static void main(String[] args) { continue; } CLDRFile cldrFile = factory.make(locale, true); - UnicodeSet exemplars = cldrFile.getExemplarSet("", WinningChoice.WINNING); + UnicodeSet exemplars = + cldrFile.getExemplarSet(ExemplarType.main, WinningChoice.WINNING); M3 data = ChainedMap.of( diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateIndexCharacters.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateIndexCharacters.java index d806b8c8a35..806209d1339 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateIndexCharacters.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateIndexCharacters.java @@ -9,11 +9,11 @@ import java.util.List; import java.util.Set; import org.unicode.cldr.draft.FileUtilities; -import org.unicode.cldr.test.CheckExemplars.ExemplarType; import org.unicode.cldr.test.DisplayAndInputProcessor; import org.unicode.cldr.util.CLDRFile; import org.unicode.cldr.util.CLDRFile.WinningChoice; import org.unicode.cldr.util.CLDRPaths; +import org.unicode.cldr.util.ExemplarSets.ExemplarType; import org.unicode.cldr.util.Factory; import org.unicode.cldr.util.SimpleFactory; @@ -44,7 +44,7 @@ public static String getConstructedIndexSet(String locale, CLDRFile cFile) { // ICU. AlphabeticIndex index = new AlphabeticIndex<>(uLocale); index.clearRecords(); - UnicodeSet indexLabels = cFile.getExemplarSet("index", WinningChoice.WINNING); + UnicodeSet indexLabels = cFile.getExemplarSet(ExemplarType.index, WinningChoice.WINNING); if (indexLabels != null && indexLabels.size() > 0) { index.addLabels(indexLabels); } diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ListRedundantUnicodeSets.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ListRedundantUnicodeSets.java index b8c5b54edfc..9bbfcd360e4 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ListRedundantUnicodeSets.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ListRedundantUnicodeSets.java @@ -16,9 +16,9 @@ import java.util.TreeSet; import org.unicode.cldr.util.CLDRConfig; import org.unicode.cldr.util.CLDRFile; -import org.unicode.cldr.util.CLDRFile.ExemplarType; import org.unicode.cldr.util.CLDRFile.WinningChoice; import org.unicode.cldr.util.CLDRLocale; +import org.unicode.cldr.util.ExemplarSets.ExemplarType; import org.unicode.cldr.util.Factory; import org.unicode.cldr.util.ICUServiceBuilder; import org.unicode.cldr.util.Level; diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ShowLanguages.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ShowLanguages.java index 808fe69ebfd..894088f623b 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ShowLanguages.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ShowLanguages.java @@ -59,6 +59,7 @@ import org.unicode.cldr.util.CLDRTool; import org.unicode.cldr.util.CLDRURLS; import org.unicode.cldr.util.CldrUtility; +import org.unicode.cldr.util.ExemplarSets.ExemplarType; import org.unicode.cldr.util.Factory; import org.unicode.cldr.util.FileCopier; import org.unicode.cldr.util.Iso639Data; @@ -619,7 +620,8 @@ private static void addLanguageScriptCells2( } catch (RuntimeException e) { scriptSet = new UnicodeSet(); } - UnicodeSet exemplars = nativeLanguage.getExemplarSet("", WinningChoice.WINNING); + UnicodeSet exemplars = + nativeLanguage.getExemplarSet(ExemplarType.main, WinningChoice.WINNING); if (scriptSet.containsNone(exemplars)) { System.out.println( "Skipping CLDR file -- exemplars differ: " diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/CLDRFile.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/CLDRFile.java index a30585ac6ec..91792edd8e7 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/CLDRFile.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/CLDRFile.java @@ -52,6 +52,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; +import org.unicode.cldr.util.ExemplarSets.ExemplarType; import org.unicode.cldr.util.GrammarInfo.GrammaticalFeature; import org.unicode.cldr.util.LocaleInheritanceInfo.Reason; import org.unicode.cldr.util.SupplementalDataInfo.PluralInfo.Count; @@ -2369,36 +2370,12 @@ public CLDRFile makeDraft(DraftStatus draftStatus) { return this; } - public UnicodeSet getExemplarSet(String type, WinningChoice winningChoice) { - return getExemplarSet(type, winningChoice, UnicodeSet.CASE_INSENSITIVE); - } - - public UnicodeSet getExemplarSet(ExemplarType type, WinningChoice winningChoice) { - return getExemplarSet(type, winningChoice, UnicodeSet.CASE_INSENSITIVE); - } - static final UnicodeSet HACK_CASE_CLOSURE_SET = new UnicodeSet( "[ſẛffẞ{i̇}\u1F71\u1F73\u1F75\u1F77\u1F79\u1F7B\u1F7D\u1FBB\u1FBE\u1FC9\u1FCB\u1FD3\u1FDB\u1FE3\u1FEB\u1FF9\u1FFB\u2126\u212A\u212B]") .freeze(); - public enum ExemplarType { - main, - auxiliary, - index, - punctuation, - numbers; - - public static ExemplarType fromString(String type) { - return type.isEmpty() ? main : valueOf(type); - } - } - - public UnicodeSet getExemplarSet(String type, WinningChoice winningChoice, int option) { - return getExemplarSet(ExemplarType.fromString(type), winningChoice, option); - } - - public UnicodeSet getExemplarSet(ExemplarType type, WinningChoice winningChoice, int option) { + public UnicodeSet getExemplarSet(ExemplarType type, WinningChoice winningChoice) { UnicodeSet result = getRawExemplarSet(type, winningChoice); if (result.isEmpty()) { return result.cloneAsThawed(); diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/CoreCoverageInfo.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/CoreCoverageInfo.java index b82d4143f59..af4cfd0c9c3 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/CoreCoverageInfo.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/CoreCoverageInfo.java @@ -20,7 +20,7 @@ import org.unicode.cldr.draft.ScriptMetadata.Info; import org.unicode.cldr.draft.ScriptMetadata.Trinary; import org.unicode.cldr.tool.LikelySubtags; -import org.unicode.cldr.util.CLDRFile.ExemplarType; +import org.unicode.cldr.util.ExemplarSets.ExemplarType; import org.unicode.cldr.util.Iso639Data.Type; import org.unicode.cldr.util.SupplementalDataInfo.PluralType; diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/ExemplarSets.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/ExemplarSets.java new file mode 100644 index 00000000000..27faa2f921d --- /dev/null +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/ExemplarSets.java @@ -0,0 +1,121 @@ +package org.unicode.cldr.util; + +import com.ibm.icu.lang.UScript; +import com.ibm.icu.text.UnicodeSet; +import java.util.BitSet; +import java.util.Locale; + +public class ExemplarSets { + + public static final UnicodeSet AlwaysOK = + new UnicodeSet( + "[[[:Nd:][:script=common:][:script=inherited:]-[:Default_Ignorable_Code_Point:]-[:C:] - [_]] [\u05BE \u05F3 \u066A-\u066C]" + + "[[؉][་ །༌][ཱ]‎‎{য়}য়]" + + // TODO Fix this Hack; reference: + // https://unicode-org.atlassian.net/browse/CLDR-19115 + "-[❮❯]]") + .freeze(); // [\\u200c-\\u200f] + // [:script=common:][:script=inherited:] + + public static final UnicodeSet HangulSyllables = + new UnicodeSet("[[:Hangul_Syllable_Type=LVT:][:Hangul_Syllable_Type=LV:]]").freeze(); + public static final BitSet Jpan = new BitSet(); + public static final BitSet Kore = new BitSet(); + + /* + TODO: all of this should be dynamic, using the script data as below: + + + ... + Reference: https://unicode-org.atlassian.net/browse/CLDR-18950 + */ + static { + ExemplarSets.Jpan.set(UScript.HAN); + ExemplarSets.Jpan.set(UScript.HIRAGANA); + ExemplarSets.Jpan.set(UScript.KATAKANA); + ExemplarSets.Kore.set(UScript.HAN); + ExemplarSets.Kore.set(UScript.HANGUL); + } + + // TODO Fix some of these characters + private static final UnicodeSet SPECIAL_ALLOW = + new UnicodeSet( + "[\u061C\\u200E\\u200F\\u200c\\u200d" + + "‎‎‎[\u064B\u064E-\u0651\u0670]‎[:Nd:]‎[\u0951\u0952]‎[\u064B-\u0652\u0654-\u0657\u0670]‎[\u0A66-\u0A6F][\u0ED0-\u0ED9][\u064B-\u0652]‎[\\u02BB\\u02BC][\u0CE6-\u0CEF]‎‎[\u0966-\u096F]" + + "‎‎‎[:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] ]" // restore + // [:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] + ) + .freeze(); // add RLM, LRM [\u200C\u200D]‎ + private static final UnicodeSet UAllowedInExemplars = + new UnicodeSet( + "[[:assigned:]-[:Z:]]") // [:alphabetic:][:Mn:][:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] + .removeAll(AlwaysOK) // this will remove some + // [:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] so we + // restore them + // in SPECIAL_ALLOW + .addAll(SPECIAL_ALLOW) // add RLM, LRM [\u200C\u200D]‎ + .freeze(); + private static final UnicodeSet AllowedInExemplars = + new UnicodeSet(UAllowedInExemplars) + .removeAll(new UnicodeSet("[[:Uppercase:]-[\u0130]]")) + .freeze(); + + private static final UnicodeSet UAllowedInNumbers = + new UnicodeSet( + "[\u00A0\u202F[:N:][:P:][:Sm:][:Letter_Number:][:Numeric_Type=Numeric:]]") // [:alphabetic:][:Mn:][:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] + .addAll(SPECIAL_ALLOW) // add RLM, LRM [\u200C\u200D]‎ + .freeze(); + private static final UnicodeSet ALLOWED_IN_NUMBERS_NOT_IN_MAIN = + new UnicodeSet("[[:Numeric_Type=Decimal:]]").freeze(); + + private static final UnicodeSet ALLOWED_IN_MAIN = + new UnicodeSet(AllowedInExemplars).removeAll(ALLOWED_IN_NUMBERS_NOT_IN_MAIN).freeze(); + + private static final UnicodeSet ALLOWED_IN_PUNCTUATION = + new UnicodeSet("[[:P:][:S:]-[:Sc:]]").freeze(); + private static final UnicodeSet ALLOWED_IN_AUX = + new UnicodeSet(AllowedInExemplars) + .addAll(ALLOWED_IN_PUNCTUATION) + .removeAll(AlwaysOK) // this will remove some + // [:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] so we + // restore them + // in SPECIAL_ALLOW + .addAll(SPECIAL_ALLOW) // add RLM, LRM [\u200C\u200D]‎ + .freeze(); + + public enum ExemplarType { + main( + ALLOWED_IN_MAIN, + "(specific-script - uppercase - invisibles - numbers + \u0130)", + true), + auxiliary(ALLOWED_IN_AUX, "(specific-script - uppercase - invisibles + \u0130)", true), + punctuation(ALLOWED_IN_PUNCTUATION, "punctuation", false), + punctuation_auxiliary(ALLOWED_IN_PUNCTUATION, "punctuation-auxiliary", false), + punctuation_person(ALLOWED_IN_PUNCTUATION, "punctuation-person", false), + numbers(UAllowedInNumbers, "(specific-script - invisibles)", false), + numbers_auxiliary(UAllowedInNumbers, "(specific-script - invisibles)", false), + index(UAllowedInExemplars, "(specific-script - invisibles)", false), + ; + + public final UnicodeSet allowed; + public final UnicodeSet toRemove; + public final String message; + public final boolean convertUppercase; + + ExemplarType(UnicodeSet allowed, String message, boolean convertUppercase) { + if (!allowed.isFrozen()) { + throw new IllegalArgumentException("Internal Error"); + } + this.allowed = allowed; + this.message = message; + this.toRemove = new UnicodeSet(allowed).complement().freeze(); + this.convertUppercase = convertUppercase; + } + + public static ExemplarType from(String name) { + return name == null || name.isEmpty() + ? ExemplarType.main + : ExemplarType.valueOf(name.replace('-', '_').toLowerCase(Locale.ROOT)); + } + } +} diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/LocaleScriptInfo.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/LocaleScriptInfo.java index a22429a3c86..5a01f64e62a 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/LocaleScriptInfo.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/LocaleScriptInfo.java @@ -6,8 +6,8 @@ import com.ibm.icu.text.UnicodeSetIterator; import java.util.Map; import java.util.Set; -import org.unicode.cldr.util.CLDRFile.ExemplarType; import org.unicode.cldr.util.CLDRFile.WinningChoice; +import org.unicode.cldr.util.ExemplarSets.ExemplarType; import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData; import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData.Type; diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/ValuePathStatus.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/ValuePathStatus.java index 517e20866da..0173989831b 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/ValuePathStatus.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/ValuePathStatus.java @@ -5,6 +5,7 @@ import com.ibm.icu.util.Output; import java.util.List; import org.unicode.cldr.util.CLDRFile.WinningChoice; +import org.unicode.cldr.util.ExemplarSets.ExemplarType; import org.unicode.cldr.util.SupplementalDataInfo.PluralInfo; import org.unicode.cldr.util.SupplementalDataInfo.PluralInfo.Count; @@ -20,7 +21,7 @@ public enum MissingOK { public static final UnicodeSet LATIN = new UnicodeSet("[:sc=Latn:]").freeze(); public static boolean isLatinScriptLocale(CLDRFile sourceFile) { - UnicodeSet main = sourceFile.getExemplarSet("", WinningChoice.WINNING); + UnicodeSet main = sourceFile.getExemplarSet(ExemplarType.main, WinningChoice.WINNING); return LATIN.containsSome(main); } diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java index 8536e681ac5..513a94d6afa 100644 --- a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java +++ b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java @@ -30,7 +30,6 @@ import org.unicode.cldr.tool.ToolConstants; import org.unicode.cldr.util.CLDRConfig; import org.unicode.cldr.util.CLDRFile; -import org.unicode.cldr.util.CLDRFile.ExemplarType; import org.unicode.cldr.util.CLDRFile.WinningChoice; import org.unicode.cldr.util.CLDRLocale; import org.unicode.cldr.util.CalculatedCoverageLevels; @@ -38,6 +37,7 @@ import org.unicode.cldr.util.ChainedMap.M3; import org.unicode.cldr.util.CldrUtility; import org.unicode.cldr.util.Containment; +import org.unicode.cldr.util.ExemplarSets.ExemplarType; import org.unicode.cldr.util.Factory; import org.unicode.cldr.util.LanguageTagParser; import org.unicode.cldr.util.Level; diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestBasic.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestBasic.java index 7b6d7a17b9a..ada7c2708f2 100644 --- a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestBasic.java +++ b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestBasic.java @@ -69,6 +69,7 @@ import org.unicode.cldr.util.DtdType; import org.unicode.cldr.util.DtdType.DtdStatus; import org.unicode.cldr.util.ElementAttributeInfo; +import org.unicode.cldr.util.ExemplarSets.ExemplarType; import org.unicode.cldr.util.Factory; import org.unicode.cldr.util.InputStreamFactory; import org.unicode.cldr.util.LanguageTagParser; @@ -421,8 +422,8 @@ public void TestCurrencyFallback() { final UnicodeSet OK_CURRENCY_FALLBACK = new UnicodeSet("[\\u0000-\\u00FF]") - .addAll(safeExemplars(file, "")) - .addAll(safeExemplars(file, "auxiliary")) + .addAll(safeExemplars(file, ExemplarType.main)) + .addAll(safeExemplars(file, ExemplarType.auxiliary)) .freeze(); UnicodeSet badSoFar = new UnicodeSet(); @@ -754,8 +755,8 @@ private static NumberFormat getCurrencyInstance(ULocale locale, int type) { return format; } - private UnicodeSet safeExemplars(CLDRFile file, String string) { - final UnicodeSet result = file.getExemplarSet(string, WinningChoice.NORMAL); + private UnicodeSet safeExemplars(CLDRFile file, ExemplarType type) { + final UnicodeSet result = file.getExemplarSet(type, WinningChoice.NORMAL); return result != null ? result : new UnicodeSet(); } @@ -1144,18 +1145,19 @@ public void TestCoreData() { logln(name + " is missing " + MissingType.plurals.toString()); warnings.put(MissingType.plurals, "missing"); } - UnicodeSet main = cldrFile.getExemplarSet("", WinningChoice.WINNING); + UnicodeSet main = cldrFile.getExemplarSet(ExemplarType.main, WinningChoice.WINNING); if (main == null || main.isEmpty()) { errln(" " + name + " is missing " + MissingType.main_exemplars.toString()); errors.put(MissingType.main_exemplars, "missing"); } - UnicodeSet index = cldrFile.getExemplarSet("index", WinningChoice.WINNING); + UnicodeSet index = + cldrFile.getExemplarSet(ExemplarType.index, WinningChoice.WINNING); if (index == null || index.isEmpty()) { logln(name + " is missing " + MissingType.index_exemplars.toString()); warnings.put(MissingType.index_exemplars, "missing"); } UnicodeSet punctuation = - cldrFile.getExemplarSet("punctuation", WinningChoice.WINNING); + cldrFile.getExemplarSet(ExemplarType.punctuation, WinningChoice.WINNING); if (punctuation == null || punctuation.isEmpty()) { logln(name + " is missing " + MissingType.punct_exemplars.toString()); warnings.put(MissingType.punct_exemplars, "missing"); diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestDisplayAndInputProcessor.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestDisplayAndInputProcessor.java index 3f79a327bc3..91b7dbcee50 100644 --- a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestDisplayAndInputProcessor.java +++ b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestDisplayAndInputProcessor.java @@ -14,8 +14,8 @@ import org.unicode.cldr.test.DisplayAndInputProcessor.PathSpaceType; import org.unicode.cldr.util.CLDRConfig; import org.unicode.cldr.util.CLDRFile; -import org.unicode.cldr.util.CLDRFile.ExemplarType; import org.unicode.cldr.util.CodePointEscaper; +import org.unicode.cldr.util.ExemplarSets.ExemplarType; import org.unicode.cldr.util.Factory; public class TestDisplayAndInputProcessor extends TestFmwk { diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestSupplementalInfo.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestSupplementalInfo.java index 7c412bf7b2d..fb33433b91b 100644 --- a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestSupplementalInfo.java +++ b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestSupplementalInfo.java @@ -60,6 +60,7 @@ import org.unicode.cldr.util.CLDRURLS; import org.unicode.cldr.util.CldrUtility; import org.unicode.cldr.util.DateConstants; +import org.unicode.cldr.util.ExemplarSets.ExemplarType; import org.unicode.cldr.util.GrammarInfo; import org.unicode.cldr.util.GrammarInfo.GrammaticalFeature; import org.unicode.cldr.util.GrammarInfo.GrammaticalScope; @@ -898,7 +899,7 @@ public void TestDefaultScriptCompleteness() { continue; } CLDRFile cldrFile = testInfo.getCLDRFile(locale, false); - UnicodeSet set = cldrFile.getExemplarSet("", WinningChoice.NORMAL); + UnicodeSet set = cldrFile.getExemplarSet(ExemplarType.main, WinningChoice.NORMAL); for (String s : set) { int script = UScript.getScript(s.codePointAt(0)); if (script != UScript.UNKNOWN diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/UnicodeSetPrettyPrinterTest.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/UnicodeSetPrettyPrinterTest.java index 85df269c53f..88d066348cf 100644 --- a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/UnicodeSetPrettyPrinterTest.java +++ b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/UnicodeSetPrettyPrinterTest.java @@ -21,9 +21,9 @@ import org.unicode.cldr.icu.dev.test.TestFmwk; import org.unicode.cldr.util.CLDRConfig; import org.unicode.cldr.util.CLDRFile; -import org.unicode.cldr.util.CLDRFile.ExemplarType; import org.unicode.cldr.util.CLDRFile.WinningChoice; import org.unicode.cldr.util.CodePointEscaper; +import org.unicode.cldr.util.ExemplarSets.ExemplarType; import org.unicode.cldr.util.Factory; import org.unicode.cldr.util.Joiners; import org.unicode.cldr.util.Level; diff --git a/tools/cldr-rdf/src/main/java/org/unicode/cldr/tool/WikiSubdivisionLanguages.java b/tools/cldr-rdf/src/main/java/org/unicode/cldr/tool/WikiSubdivisionLanguages.java index 896f722558b..29b7e7e7d64 100644 --- a/tools/cldr-rdf/src/main/java/org/unicode/cldr/tool/WikiSubdivisionLanguages.java +++ b/tools/cldr-rdf/src/main/java/org/unicode/cldr/tool/WikiSubdivisionLanguages.java @@ -44,6 +44,7 @@ import org.unicode.cldr.util.ChainedMap.M4; import org.unicode.cldr.util.CldrUtility; import org.unicode.cldr.util.Counter; +import org.unicode.cldr.util.ExemplarSets.ExemplarType; import org.unicode.cldr.util.Factory; import org.unicode.cldr.util.PatternCache; import org.unicode.cldr.util.SimpleXMLSource; @@ -227,9 +228,11 @@ public static void main(String[] args) throws IOException { Multimap inverse = LinkedHashMultimap.create(); CLDRFile fileSubdivisions = fixedFile(oldFileSubdivisions, inverse); - UnicodeSet main = file.getExemplarSet("", WinningChoice.WINNING, 0); - UnicodeSet auxiliary = file.getExemplarSet("auxiliary", WinningChoice.WINNING); - UnicodeSet punctuation = file.getExemplarSet("punctuation", WinningChoice.WINNING); + UnicodeSet main = file.getExemplarSet(ExemplarType.main, WinningChoice.WINNING); + UnicodeSet auxiliary = + file.getExemplarSet(ExemplarType.auxiliary, WinningChoice.WINNING); + UnicodeSet punctuation = + file.getExemplarSet(ExemplarType.punctuation, WinningChoice.WINNING); UnicodeSet numbers = file.getExemplarsNumeric(NumberingSystem.defaultSystem); exemplars = new UnicodeSet()