Skip to content

Commit 84bf4cb

Browse files
authored
Tweaks to Unihan property handling (#1022)
* kZhuang
* Mark multivalued Unihan properties that actually have multiple values as multivalued
* GenerateEnums
* Revert "Mark multivalued Unihan properties that actually have multiple values as multivalued" This reverts commit bacfe60.
* Update the correct file and remove files that do nothing
* GenerateEnums
* space
* Throw semicolons at the problem
* meow
* Add to index
* Somehow the code in UCD.java becomes a little bit cleaner
* These raw maps are awful
* Name collision
* … just remove the dead code
* spotless
1 parent abc84a7 commit 84bf4cb

File tree

11 files changed

+100
-366
lines changed

11 files changed

+100
-366
lines changed

UnicodeJsps/src/main/java/org/unicode/jsp/XPropertyFactory.java

Lines changed: 0 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import com.google.common.collect.Multimap;
55
import com.google.common.collect.TreeMultimap;
66
import com.ibm.icu.impl.UnicodeMap;
7-
import com.ibm.icu.lang.UCharacter;
87
import com.ibm.icu.lang.UProperty.NameChoice;
98
import com.ibm.icu.text.CollationElementIterator;
109
import com.ibm.icu.text.Normalizer;
@@ -568,89 +567,6 @@ protected String _getValue(int codepoint) {
568567
}
569568
}
570569

571-
private static class IcuEnumProperty extends XEnumUnicodeProperty {
572-
final int propNum;
573-
574-
public IcuEnumProperty(int propNum) {
575-
super(
576-
UCharacter.getPropertyName(propNum, NameChoice.LONG),
577-
getValues(propNum).toArray());
578-
this.propNum = propNum;
579-
}
580-
581-
private static List<String> getValues(int propNum) {
582-
List<String> valueList = new ArrayList<String>();
583-
for (int i = UCharacter.getIntPropertyMinValue(propNum);
584-
i <= UCharacter.getIntPropertyMaxValue(propNum);
585-
++i) {
586-
valueList.add(UCharacter.getPropertyValueName(propNum, i, NameChoice.LONG));
587-
}
588-
return valueList;
589-
}
590-
591-
@Override
592-
protected String _getValue(int codepoint) {
593-
int propValue = UCharacter.getIntPropertyValue(codepoint, propNum);
594-
try {
595-
return UCharacter.getPropertyValueName(propNum, propValue, NameChoice.LONG);
596-
} catch (Exception e) {
597-
return "n/a";
598-
}
599-
}
600-
}
601-
602-
// private static class IcuBidiPairedBracket extends SimpleProperty {
603-
// final int propNum;
604-
// public IcuBidiPairedBracket() {
605-
// setName(UCharacter.getPropertyName(UProperty.BIDI_PAIRED_BRACKET,
606-
// NameChoice.LONG));
607-
// this.propNum = UProperty.BIDI_PAIRED_BRACKET;
608-
// }
609-
// @Override
610-
// public List _getNameAliases(List result) {
611-
// return Arrays.asList(UCharacter.getPropertyName(propNum, NameChoice.LONG),
612-
// UCharacter.getPropertyName(propNum, NameChoice.SHORT));
613-
// }
614-
//
615-
// @Override
616-
// protected String _getValue(int codepoint) {
617-
// return UTF16.valueOf(UCharacter.getBidiPairedBracket(codepoint));
618-
// }
619-
// @Override
620-
// protected UnicodeMap _getUnicodeMap() {
621-
// // TODO Auto-generated method stub
622-
// return super._getUnicodeMap();
623-
// }
624-
// }
625-
626-
// private static class Usage extends XEnumUnicodeProperty {
627-
// enum UsageValues {common, historic, deprecated, liturgical, limited, symbol,
628-
// punctuation, na;
629-
// public static UsageValues getValue(int codepoint) {
630-
// if (UnicodeProperty.SPECIALS.contains(codepoint)) return na;
631-
// if (UnicodeUtilities.DEPRECATED.contains(codepoint)) return deprecated;
632-
// if (UnicodeUtilities.LITURGICAL.contains(codepoint)) return liturgical;
633-
// //if (ScriptCategoriesCopy.ARCHAIC.contains(codepoint)) return historic;
634-
// //if (UnicodeUtilities.LIM.contains(codepoint)) return archaic;
635-
// if (UnicodeUtilities.COMMON_USE_SCRIPTS.contains(codepoint)) {
636-
// if (UnicodeUtilities.SYMBOL.contains(codepoint)) return symbol;
637-
// if (UnicodeUtilities.PUNCTUATION.contains(codepoint)) return punctuation;
638-
// return common;
639-
// }
640-
// return limited;
641-
// }
642-
// }
643-
// public Usage() {
644-
// super("Usage", UsageValues.values());
645-
// setType(UnicodeProperty.EXTENDED_ENUMERATED);
646-
// }
647-
//
648-
// @Override
649-
// protected String _getValue(int codepoint) {
650-
// return UsageValues.getValue(codepoint).toString();
651-
// }
652-
// }
653-
654570
static class HanType extends XEnumUnicodeProperty {
655571
enum HanTypeValues {
656572
na,

unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import java.util.EnumMap;
2929
import java.util.EnumSet;
3030
import java.util.HashMap;
31+
import java.util.HashSet;
3132
import java.util.LinkedHashSet;
3233
import java.util.List;
3334
import java.util.Locale;
@@ -263,7 +264,7 @@ public UnicodeMap<Double> loadDouble(UcdProperty prop2) {
263264
|| prop2 == UcdProperty.kAccountingNumeric
264265
|| prop2 == UcdProperty.kOtherNumeric) {
265266
// Unicode 15.1+: A character may have multiple Unihan numeric values.
266-
pos = v.indexOf(' ');
267+
pos = v.indexOf('|');
267268
if (pos >= 0) {
268269
v = value.substring(0, pos);
269270
}
@@ -839,11 +840,21 @@ public List<String> _getNameAliases(List result) {
839840
}
840841

841842
@Override
842-
protected List<String> _getAvailableValues(List result) {
843+
protected List<String> _getAvailableValues(List<String> result) {
843844
if (stringToNamedEnum != null) {
844845
result.addAll(enumValueNames);
845846
return result;
846847
}
848+
if (isMultivalued()) {
849+
HashSet<String> valueSet = new HashSet<>();
850+
for (var value : _getUnicodeMap().getAvailableValues()) {
851+
for (var part : delimiterSplitter.split(value)) {
852+
valueSet.add(part);
853+
}
854+
}
855+
result.addAll(valueSet);
856+
return result;
857+
}
847858
return _getUnicodeMap().getAvailableValues(result);
848859
}
849860

unicodetools/src/main/java/org/unicode/props/UcdProperty.java

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ public enum UcdProperty {
4848
Numeric_Value(PropertyType.Numeric, "nv"),
4949
kAccountingNumeric(PropertyType.Numeric, "cjkAccountingNumeric"),
5050
kOtherNumeric(PropertyType.Numeric, "cjkOtherNumeric"),
51-
kPrimaryNumeric(PropertyType.Numeric, "cjkPrimaryNumeric"),
51+
kPrimaryNumeric(PropertyType.Numeric, null, ValueCardinality.Ordered, "cjkPrimaryNumeric"),
5252

5353
// String
5454
Bidi_Mirroring_Glyph(PropertyType.String, "bmg"),
@@ -91,7 +91,11 @@ public enum UcdProperty {
9191
Named_Sequences_Prov(PropertyType.Miscellaneous, "NSP"),
9292
Standardized_Variant(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "SV"),
9393
Unicode_1_Name(PropertyType.Miscellaneous, "na1"),
94-
kAlternateTotalStrokes(PropertyType.Miscellaneous, "cjkAlternateTotalStrokes"),
94+
kAlternateTotalStrokes(
95+
PropertyType.Miscellaneous,
96+
null,
97+
ValueCardinality.Unordered,
98+
"cjkAlternateTotalStrokes"),
9599
kBigFive(PropertyType.Miscellaneous, "cjkBigFive"),
96100
kCCCII(PropertyType.Miscellaneous, "cjkCCCII"),
97101
kCNS1986(PropertyType.Miscellaneous, "cjkCNS1986"),
@@ -114,7 +118,7 @@ public enum UcdProperty {
114118
kEH_IFAO(PropertyType.Miscellaneous, "kEH_IFAO"),
115119
kEH_JSesh(PropertyType.Miscellaneous, "kEH_JSesh"),
116120
kEH_UniK(PropertyType.Miscellaneous, "kEH_UniK"),
117-
kFanqie(PropertyType.Miscellaneous, "cjkFanqie"),
121+
kFanqie(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkFanqie"),
118122
kFenn(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkFenn"),
119123
kFennIndex(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkFennIndex"),
120124
kFourCornerCode(
@@ -154,7 +158,7 @@ public enum UcdProperty {
154158
kIRG_VSource(PropertyType.Miscellaneous, "cjkIRG_VSource"),
155159
kJIS0213(PropertyType.Miscellaneous, "cjkJIS0213"),
156160
kJa(PropertyType.Miscellaneous, "cjkJa"),
157-
kJapanese(PropertyType.Miscellaneous, "cjkJapanese"),
161+
kJapanese(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkJapanese"),
158162
kJapaneseKun(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkJapaneseKun"),
159163
kJapaneseOn(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkJapaneseOn"),
160164
kJinmeiyoKanji(
@@ -180,7 +184,7 @@ public enum UcdProperty {
180184
kMandarin(PropertyType.Miscellaneous, null, ValueCardinality.Ordered, "cjkMandarin"),
181185
kMatthews(PropertyType.Miscellaneous, "cjkMatthews"),
182186
kMeyerWempe(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkMeyerWempe"),
183-
kMojiJoho(PropertyType.Miscellaneous, "cjkMojiJoho"),
187+
kMojiJoho(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkMojiJoho"),
184188
kMorohashi(PropertyType.Miscellaneous, "cjkMorohashi"),
185189
kNelson(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkNelson"),
186190
kPhonetic(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkPhonetic"),
@@ -201,31 +205,37 @@ public enum UcdProperty {
201205
"URS"),
202206
kReading(PropertyType.Miscellaneous, "kReading"),
203207
kSBGY(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSBGY"),
204-
kSMSZD2003Index(PropertyType.Miscellaneous, "cjkSMSZD2003Index"),
205-
kSMSZD2003Readings(PropertyType.Miscellaneous, "cjkSMSZD2003Readings"),
208+
kSMSZD2003Index(
209+
PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSMSZD2003Index"),
210+
kSMSZD2003Readings(
211+
PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSMSZD2003Readings"),
206212
kSemanticVariant(
207213
PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSemanticVariant"),
208214
kSpecializedSemanticVariant(
209215
PropertyType.Miscellaneous,
210216
null,
211217
ValueCardinality.Unordered,
212218
"cjkSpecializedSemanticVariant"),
213-
kSpoofingVariant(PropertyType.Miscellaneous, "cjkSpoofingVariant"),
219+
kSpoofingVariant(
220+
PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSpoofingVariant"),
214221
kSrc_NushuDuben(PropertyType.Miscellaneous, "kSrc_NushuDuben"),
215-
kStrange(PropertyType.Miscellaneous, "cjkStrange"),
222+
kStrange(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkStrange"),
216223
kTGH(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkTGH"),
217-
kTGHZ2013(PropertyType.Miscellaneous, "cjkTGHZ2013"),
224+
kTGHZ2013(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkTGHZ2013"),
218225
kTGT_MergedSrc(PropertyType.Miscellaneous, "kTGT_MergedSrc"),
219226
kTaiwanTelegraph(PropertyType.Miscellaneous, "cjkTaiwanTelegraph"),
220227
kTang(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkTang"),
221228
kTotalStrokes(PropertyType.Miscellaneous, null, ValueCardinality.Ordered, "cjkTotalStrokes"),
222229
kUnihanCore2020(PropertyType.Miscellaneous, "cjkUnihanCore2020"),
223230
kVietnamese(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkVietnamese"),
224-
kVietnameseNumeric(PropertyType.Miscellaneous, "cjkVietnameseNumeric"),
231+
kVietnameseNumeric(
232+
PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkVietnameseNumeric"),
225233
kXHC1983(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkXHC1983"),
226234
kXerox(PropertyType.Miscellaneous, "cjkXerox"),
227235
kZVariant(PropertyType.Miscellaneous, "cjkZVariant"),
228-
kZhuangNumeric(PropertyType.Miscellaneous, "cjkZhuangNumeric"),
236+
kZhuang(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkZhuang"),
237+
kZhuangNumeric(
238+
PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkZhuangNumeric"),
229239

230240
// Catalog
231241
Age(PropertyType.Catalog, Age_Values.class, null, "age"),

unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1579,6 +1579,7 @@ public static kEH_Core_Values forName(String name) {
15791579
// kVietnameseNumeric
15801580
// kXerox
15811581
// kXHC1983
1582+
// kZhuang
15821583
// kZhuangNumeric
15831584
// kZVariant
15841585
public enum Line_Break_Values implements Named {

unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import java.util.ArrayList;
2121
import java.util.Arrays;
2222
import java.util.Collection;
23+
import java.util.Collections;
2324
import java.util.Comparator;
2425
import java.util.HashMap;
2526
import java.util.HashSet;
@@ -157,7 +158,7 @@ public static synchronized void ResetCacheProperties() {
157158
private boolean isMultivalued = false;
158159

159160
private String delimiter = ",";
160-
private Splitter delimiterSplitter = Splitter.on(delimiter);
161+
protected Splitter delimiterSplitter = Splitter.on(delimiter);
161162

162163
public UnicodeProperty setMultivalued(boolean value) {
163164
isMultivalued = value;
@@ -263,6 +264,12 @@ public String getVersion() {
263264
return _getVersion();
264265
}
265266

267+
public Iterable<String> getValues(int codepoint) {
268+
return isMultivalued
269+
? delimiterSplitter.split(getValue(codepoint))
270+
: Collections.singleton(getValue(codepoint));
271+
}
272+
266273
public String getValue(int codepoint) {
267274
if (DEBUG && CHECK_VALUE == codepoint && CHECK_NAME.equals(getName())) {
268275
String value = _getValue(codepoint);
@@ -290,8 +297,10 @@ public List<String> getValueAliases(String valueAlias, List<String> result) {
290297
if (result == null) result = new ArrayList<>(1);
291298
result = _getValueAliases(valueAlias, result);
292299
if (!result.contains(valueAlias)) { // FIX && type < NUMERIC
293-
if (type == MISC) {
300+
if (type == MISC || type == NUMERIC) {
294301
// Unihan has multivalued properties but does not use aliases.
302+
// The concept of aliases does not really apply to numeric properties,
303+
// but we should apply UAX44-LM1. We don’t, though.
295304
result.add(valueAlias);
296305
} else {
297306
result = _getValueAliases(valueAlias, result); // for debugging

unicodetools/src/main/java/org/unicode/text/UCD/UCD.java

Lines changed: 21 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -522,41 +522,28 @@ static class HanException {
522522
}
523523

524524
private void populateHanExceptions(UnicodeProperty numeric) {
525-
for (String value : numeric.getAvailableValues()) {
526-
if (value == null || value.equals("NaN")) {
527-
continue;
528-
}
529-
String propertyValue = Utility.replace(value, ",", "");
530-
final int hack = propertyValue.indexOf(' ');
531-
if (hack >= 0) {
532-
Utility.fixDot();
533-
if (SHOW_LOADING) {
534-
System.out.println("BAD NUMBER: " + value);
535-
}
536-
propertyValue = propertyValue.substring(0, hack);
537-
}
538-
539-
for (String s : numeric.getSet(value)) {
540-
final int code = s.codePointAt(0);
541-
// Unicode 15.1:
542-
// This code had these two exceptions, but now U+4EAC actually has value
543-
// 10000000000000000
544-
// and we want to see that in DerivedNumericValues.txt,
545-
// so we stop making these exceptions.
546-
if (compositeVersion < 0xf0100 && (code == 0x5793 || code == 0x4EAC)) {
547-
continue; // two exceptions!!
548-
}
549-
550-
HanException except = (HanException) hanExceptions.get(code);
551-
if (except != null) {
552-
throw new IllegalArgumentException(
553-
"Duplicate Numeric Value for U+" + Utility.hex(code));
554-
}
555-
except = new HanException();
556-
hanExceptions.put(code, except);
557-
except.numericValue = Double.parseDouble(propertyValue);
558-
except.numericType = NUMERIC;
525+
for (final int code : numeric.getSet("NaN").complement().codePoints()) {
526+
// Unicode 15.1:
527+
// This code had these two exceptions, but now U+4EAC actually has value
528+
// 10000000000000000
529+
// and we want to see that in DerivedNumericValues.txt,
530+
// so we stop making these exceptions.
531+
// NOTE(egg): These two exceptions (we are in a function called exceptions, so these are
532+
// exceptions to the broader exception that is Han numeric values) were made irrelevant
533+
// sometime before Unicode 5.2. See L2/03-094 for background.
534+
if (compositeVersion < 0xf0100 && (code == 0x5793 || code == 0x4EAC)) {
535+
continue; // two exceptions!!
536+
}
537+
538+
HanException except = (HanException) hanExceptions.get(code);
539+
if (except != null && false) {
540+
throw new IllegalArgumentException(
541+
"Duplicate Numeric Value for U+" + Utility.hex(code));
559542
}
543+
except = new HanException();
544+
hanExceptions.put(code, except);
545+
except.numericValue = Double.parseDouble(numeric.getValues(code).iterator().next());
546+
except.numericType = NUMERIC;
560547
}
561548
}
562549

unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,7 @@ cjkVietnameseNumeric ; kVietnameseNumeric
167167
cjkZhuangNumeric ; kZhuangNumeric
168168
# 16.0
169169
cjkFanqie ; kFanqie
170+
cjkZhuang ; kZhuang
170171

171172
kTGT_MergedSrc ; kTGT_MergedSrc
172173
kRSTUnicode ; kRSTUnicode

unicodetools/src/main/resources/org/unicode/props/IndexPropertyRegex.txt

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ kCCCII ; EXTENSIBLE ; [0-9A-F]{6}
105105
kEACC ; SINGLE_VALUED ; [0-9A-F]{6}
106106
kAccountingNumeric ; SINGLE_VALUED ; [0-9]+
107107
kOtherNumeric ; SINGLE_VALUED ; [0-9]+
108-
kPrimaryNumeric ; SINGLE_VALUED ; [0-9]+
108+
kPrimaryNumeric ; ORDERED ; [0-9]+
109109
kFenn ; MULTI_VALUED ; [0-9]+a?[A-KP*]
110110
kCowles ; MULTI_VALUED ; [0-9]{1,4}(\.[0-9]{1,2})?
111111
kXerox ; SINGLE_VALUED ; [0-9]{3}:[0-9]{3}
@@ -176,11 +176,29 @@ kKoreanEducationHanja ; MULTI_VALUED ; 20[0-9]{2}
176176
kKoreanName ; MULTI_VALUED ; (20[0-9]{2})(:U\+2?[0-9A-F]{4})*
177177
kTGH ; MULTI_VALUED ; 20[0-9]{2}:[1-9][0-9]{0,3}
178178

179-
180179
kIRG_UKSource ; SINGLE_VALUED ; V[0-4]-[0-9A-F]{4}
181180
kIRG_SSource ; SINGLE_VALUED ; V[0-4]-[0-9A-F]{4}
182181

183-
182+
# Unihan properties from 13.0 and later. No regexes for now.
183+
# TODO(egg): We should automate the updating of the regexes from UAX #38.
184+
kSpoofingVariant ; MULTI_VALUED ; .*
185+
kTGHZ2013 ; MULTI_VALUED ; .*
186+
kUnihanCore2020 ; SINGLE_VALUED ; .*
187+
# 14.0
188+
kStrange ; MULTI_VALUED ; .*
189+
# 15.0
190+
kAlternateTotalStrokes ; MULTI_VALUED ; .*
191+
# 15.1
192+
kJapanese ; MULTI_VALUED ; .*
193+
kMojiJoho ; MULTI_VALUED ; .*
194+
kSMSZD2003Index ; MULTI_VALUED ; .*
195+
kSMSZD2003Readings ; MULTI_VALUED ; .*
196+
kVietnameseNumeric ; MULTI_VALUED ; .*
197+
kZhuangNumeric ; MULTI_VALUED ; .*
198+
# 16.0
199+
kFanqie ; MULTI_VALUED ; .*
200+
kZhuang ; MULTI_VALUED ; .*
201+
184202
# =============================
185203
# Catalog/Enum/Binary Properties
186204
# All not listed are SINGLE_VALUED ; null

0 commit comments

Comments
 (0)