Skip to content

Commit 61259f2

Browse files
eggrobin and markusicu authored
Actually check values of enumerated properties (#1083)
Co-authored-by: Markus Scherer <[email protected]>
1 parent c7f0fdb commit 61259f2

File tree

9 files changed

+368
-50
lines changed

9 files changed

+368
-50
lines changed

unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -925,11 +925,7 @@ public static void loadUcdHistory(
925925
final var properties = IndexUnicodeProperties.make(age.getShortName());
926926
for (UcdProperty property : UcdProperty.values()) {
927927
if (property.getShortName().startsWith("cjk") == unihan) {
928-
try {
929-
properties.load(property, expectCacheHit);
930-
} catch (ICUException e) {
931-
e.printStackTrace();
932-
}
928+
properties.load(property, expectCacheHit);
933929
}
934930
}
935931
System.out.println(

unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java

Lines changed: 77 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -256,7 +256,13 @@ public void put(
256256
Merge<String> merger,
257257
boolean hackHangul,
258258
UnicodeProperty nextVersion) {
259-
// MEOW
259+
if (value == null && property == UcdProperty.Idn_2008) {
260+
// The IDNA2008 Status field of the IDNA mapping table is treated as an enumerated
261+
// property by the tools, with an Extra @missing line with a value na.
262+
// Unusually, the file has data lines with no IDNA2008 Status field; the default should
263+
// apply to these ranges.
264+
return;
265+
}
260266
if (value != null
261267
&& value.isEmpty()
262268
&& property != UcdProperty.NFKC_Casefold
@@ -409,8 +415,8 @@ public String checkRegex2(String string) {
409415
public String checkEnum(String string) {
410416
final Enum item = string == null ? null : property.getEnum(string);
411417
if (item == null) {
412-
final String errorMessage = property + "\tBad enum value:\t" + string;
413-
IndexUnicodeProperties.getDataLoadingErrors().put(property, errorMessage);
418+
throw new UnicodePropertyException(
419+
"\tBad enum value for " + property + " :\t" + string);
414420
} else {
415421
string = item.toString();
416422
}
@@ -746,9 +752,37 @@ private static void parsePropertyValueFile(
746752
IndexUnicodeProperties indexUnicodeProperties,
747753
IndexUnicodeProperties nextProperties) {
748754
for (UcdLineParser.UcdLine line : parser) {
755+
if (line.getOriginalLine().equals("U+")
756+
&& filename.startsWith("Unihan")
757+
&& indexUnicodeProperties.ucdVersion.getMajor() == 2) {
758+
// Truncated Unihan-1.txt in Unicode 2.0.
759+
return;
760+
}
749761
String propName = line.getParts()[1];
750762
UcdProperty item = UcdProperty.forString(propName);
751763

764+
String extractedValue = null;
765+
if (item == null
766+
&& indexUnicodeProperties.ucdVersion.compareTo(VersionInfo.UNICODE_4_0) <= 0) {
767+
// DerivedNormalizationProps Version 4.0 and earlier is highly irregular.
768+
// It provides the NFMeow_QC assignments as a single field NFMEOW_Value,
769+
// and calls FC_NFKC_Closure FNC. Since we need special handling for the former,
770+
// we also deal with FNC here instead of making it an Extra alias.
771+
if (propName.equals("FNC")) {
772+
propName = "FC_NFKC_Closure";
773+
item = UcdProperty.forString(propName);
774+
} else {
775+
String[] parts = propName.split("_");
776+
if (parts.length == 2
777+
&& (parts[1].equals("NO") | parts[1].equals("MAYBE"))
778+
&& (parts[0].startsWith("NF"))) {
779+
propName = parts[0] + "_QC";
780+
item = UcdProperty.forString(propName);
781+
extractedValue = parts[1];
782+
}
783+
}
784+
}
785+
752786
if (item == null) {
753787
throw new IllegalArgumentException(
754788
"Missing property enum in UcdProperty for "
@@ -773,9 +807,10 @@ private static void parsePropertyValueFile(
773807
// }
774808
String value;
775809
// The file emoji-sequences.txt has a comment-like field after the binary property.
776-
if (line.getParts().length == 2
777-
|| filename.equals("emoji/*/emoji-sequences")
778-
|| filename.equals("emoji/*/emoji-zwj-sequences")) {
810+
if (extractedValue == null
811+
&& (line.getParts().length == 2
812+
|| filename.equals("emoji/*/emoji-sequences")
813+
|| filename.equals("emoji/*/emoji-zwj-sequences"))) {
779814
if (propInfo.property.getType() != PropertyType.Binary) {
780815
throw new IllegalArgumentException(
781816
"Expected a value for "
@@ -785,7 +820,14 @@ private static void parsePropertyValueFile(
785820
}
786821
value = "Yes";
787822
} else {
788-
value = line.getParts()[2];
823+
value = extractedValue != null ? extractedValue : line.getParts()[2];
824+
if (propInfo.property == UcdProperty.kJapaneseOn
825+
&& indexUnicodeProperties.ucdVersion.equals(VersionInfo.UNICODE_3_1_0)
826+
&& value.isEmpty()
827+
&& line.getParts().length == 4) {
828+
// Extra tab in the kJapaneseOn record for U+4E00.
829+
value = line.getParts()[3];
830+
}
789831
if (propInfo.property.getType() == PropertyType.Binary) {
790832
if (line.getType() == Contents.DATA
791833
&& UcdPropertyValues.Binary.forName(value)
@@ -817,7 +859,10 @@ private static void parsePropertyValueFile(
817859
&& !(propInfo.property == UcdProperty.NFKC_Casefold
818860
|| propInfo.property == UcdProperty.NFKC_Simple_Casefold)) {
819861
throw new IllegalArgumentException(
820-
"Unexpected empty value for property " + propName);
862+
"Unexpected empty value for property "
863+
+ propName
864+
+ ": "
865+
+ line.getOriginalLine());
821866
}
822867
if (propInfo.property == UcdProperty.kMandarin) {
823868
if (indexUnicodeProperties.oldVersion) {
@@ -941,26 +986,6 @@ private static void parseConfusablesFile(
941986
intRange,
942987
parts[1],
943988
nextProperties == null ? null : nextProperties.getProperty(propInfo.property));
944-
intRange.set(parts[1]);
945-
if (intRange.string == null) {
946-
if (!data.containsKey(intRange.start)) {
947-
propInfo.put(
948-
data,
949-
intRange,
950-
parts[1],
951-
nextProperties == null
952-
? null
953-
: nextProperties.getProperty(propInfo.property));
954-
}
955-
} else if (!intRange.string.isEmpty() && !data.containsKey(intRange.string)) {
956-
propInfo.put(
957-
data,
958-
intRange,
959-
parts[1],
960-
nextProperties == null
961-
? null
962-
: nextProperties.getProperty(propInfo.property));
963-
}
964989
}
965990
}
966991

@@ -1111,7 +1136,12 @@ private static void parseFields(
11111136
throw new UnicodePropertyException();
11121137
}
11131138
String value =
1114-
propInfo.fieldNumber >= parts.length ? "" : parts[propInfo.fieldNumber];
1139+
propInfo.fieldNumber >= parts.length ? null : parts[propInfo.fieldNumber];
1140+
if (propInfo.property == UcdProperty.Joining_Group
1141+
&& indexUnicodeProperties.ucdVersion.compareTo(VersionInfo.UNICODE_4_0) <= 0
1142+
&& value.equals("<no shaping>")) {
1143+
value = "No_Joining_Group";
1144+
}
11151145
propInfo.put(
11161146
data,
11171147
line.getMissingSet(),
@@ -1172,16 +1202,32 @@ private static void parseSimpleFieldFile(
11721202
IntRange range = new IntRange();
11731203
range.start = Utility.codePointFromHex(line.getParts()[0]);
11741204
range.end = Utility.codePointFromHex(line.getParts()[1]);
1205+
// Unicode 2 puts FEFF both in Arabic Presentation Forms-B and in Specials.
1206+
// We are not going to make Block multivalued for that, so we let the second
1207+
// assignment win.
1208+
// This fits with assignments in Unicode 2.1.4..3.1.1 where
1209+
// Arabic Presentation Forms-B ended on FEFE and Specials was a
1210+
// split Block of FEFF & FFF0..FFFD.
1211+
// Since Unicode 3.2, blocks were contiguous xxx0..yyyF:
1212+
// https://www.unicode.org/reports/tr28/tr28-3.html#database
1213+
// The normative blocks defined in Blocks.txt have been adjusted slightly,
1214+
// in accordance with Unicode Technical Committee decisions.
1215+
// - Every block starts and ends on a column boundary.
1216+
// That is, the last digit of the first code point in the block is always 0,
1217+
// and the last digit of the final code point in the block is always F.
1218+
// - Every block is contiguous. [...]
11751219
propInfo.put(
11761220
data,
11771221
line.getMissingSet(),
11781222
range,
11791223
line.getParts()[2],
1180-
null,
1224+
version.getMajor() == 2 ? new PropertyUtilities.Overrider() : null,
11811225
false,
11821226
nextVersion);
11831227
continue;
1184-
} else if (line.getParts().length != 2) {
1228+
} else if (line.getParts().length != 2
1229+
&& version.compareTo(VersionInfo.UNICODE_3_0) > 0) {
1230+
// Unicode 3.0 and earlier had name comments as an extra field.
11851231
throw new IllegalArgumentException(
11861232
"Too many fields in " + line.getOriginalLine());
11871233
}

unicodetools/src/main/java/org/unicode/props/PropertyUtilities.java

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,15 @@ public String merge(String first, String second) {
2828
}
2929
}
3030

31+
/**
 * A {@code Merge<String>} implementation whose {@link #merge(String, String)} always returns its
 * second argument and ignores the first.
 *
 * <p>NOTE(review): presumably {@code first} is the value already stored and {@code second} the
 * incoming one, so that later assignments win; confirm against the {@code Merge} contract.
 */
public static final class Overrider implements Merge<String> {
32+
/** Creates an overrider; the class declares no fields. */
public Overrider() {}
33+
34+
/**
 * Returns {@code second} unchanged; {@code first} is never read.
 *
 * @param first ignored
 * @param second the value to return
 * @return {@code second}
 */
@Override
35+
public String merge(String first, String second) {
36+
return second;
37+
}
38+
}
39+
3140
static final <K, V, M extends Map<K, V>> M putNew(M map, K key, V value) {
3241
final V oldValue = map.get(key);
3342
if (oldValue != null) {

unicodetools/src/main/java/org/unicode/props/UcdProperty.java

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -362,7 +362,12 @@ public enum UcdProperty {
362362
Quotation_Mark(PropertyType.Binary, Binary.class, null, "QMark"),
363363
RGI_Emoji_Flag_Sequence(PropertyType.Binary, Binary.class, null, "REFS", "Emoji_Flag_Sequence"),
364364
RGI_Emoji_Keycap_Sequence(
365-
PropertyType.Binary, Binary.class, null, "REKS", "Emoji_Keycap_Sequence"),
365+
PropertyType.Binary,
366+
Binary.class,
367+
null,
368+
"REKS",
369+
"Emoji_Keycap_Sequence",
370+
"Emoji_Combining_Sequence"),
366371
RGI_Emoji_Modifier_Sequence(
367372
PropertyType.Binary, Binary.class, null, "REMS", "Emoji_Modifier_Sequence"),
368373
RGI_Emoji_Tag_Sequence(PropertyType.Binary, Binary.class, null, "RETS", "Emoji_Tag_Sequence"),

unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java

Lines changed: 121 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -610,7 +610,105 @@ public enum Canonical_Combining_Class_Values implements Named {
610610
Above_Right("232", "AR"),
611611
Double_Below("233", "DB"),
612612
Double_Above("234", "DA"),
613-
Iota_Subscript("240", "IS");
613+
Iota_Subscript("240", "IS"),
614+
CCC37("37"),
615+
CCC38("38"),
616+
CCC39("39"),
617+
CCC40("40"),
618+
CCC41("41"),
619+
CCC42("42"),
620+
CCC43("43"),
621+
CCC44("44"),
622+
CCC45("45"),
623+
CCC46("46"),
624+
CCC47("47"),
625+
CCC48("48"),
626+
CCC49("49"),
627+
CCC50("50"),
628+
CCC51("51"),
629+
CCC52("52"),
630+
CCC53("53"),
631+
CCC54("54"),
632+
CCC55("55"),
633+
CCC56("56"),
634+
CCC57("57"),
635+
CCC58("58"),
636+
CCC59("59"),
637+
CCC60("60"),
638+
CCC61("61"),
639+
CCC62("62"),
640+
CCC63("63"),
641+
CCC64("64"),
642+
CCC65("65"),
643+
CCC66("66"),
644+
CCC67("67"),
645+
CCC68("68"),
646+
CCC69("69"),
647+
CCC70("70"),
648+
CCC71("71"),
649+
CCC72("72"),
650+
CCC73("73"),
651+
CCC74("74"),
652+
CCC75("75"),
653+
CCC76("76"),
654+
CCC77("77"),
655+
CCC78("78"),
656+
CCC79("79"),
657+
CCC80("80"),
658+
CCC81("81"),
659+
CCC82("82"),
660+
CCC83("83"),
661+
CCC85("85"),
662+
CCC86("86"),
663+
CCC87("87"),
664+
CCC88("88"),
665+
CCC89("89"),
666+
CCC90("90"),
667+
CCC92("92"),
668+
CCC93("93"),
669+
CCC94("94"),
670+
CCC95("95"),
671+
CCC96("96"),
672+
CCC97("97"),
673+
CCC98("98"),
674+
CCC99("99"),
675+
CCC100("100"),
676+
CCC101("101"),
677+
CCC102("102"),
678+
CCC104("104"),
679+
CCC105("105"),
680+
CCC106("106"),
681+
CCC108("108"),
682+
CCC109("109"),
683+
CCC110("110"),
684+
CCC111("111"),
685+
CCC112("112"),
686+
CCC113("113"),
687+
CCC114("114"),
688+
CCC115("115"),
689+
CCC116("116"),
690+
CCC117("117"),
691+
CCC119("119"),
692+
CCC120("120"),
693+
CCC121("121"),
694+
CCC123("123"),
695+
CCC124("124"),
696+
CCC125("125"),
697+
CCC126("126"),
698+
CCC127("127"),
699+
CCC128("128"),
700+
CCC131("131"),
701+
CCC134("134"),
702+
CCC135("135"),
703+
CCC136("136"),
704+
CCC137("137"),
705+
CCC138("138"),
706+
CCC139("139"),
707+
CCC140("140"),
708+
CCC141("141"),
709+
CCC142("142"),
710+
CCC143("143"),
711+
CCC144("144");
614712
private final PropertyNames<Canonical_Combining_Class_Values> names;
615713

616714
private Canonical_Combining_Class_Values(String shortName, String... otherNames) {
@@ -703,7 +801,8 @@ public enum Do_Not_Emit_Type_Values implements Named {
703801
Precomposed_Form("Precomposed_Form"),
704802
Deprecated("Deprecated"),
705803
Discouraged("Discouraged"),
706-
Preferred_Spelling("Preferred_Spelling");
804+
Preferred_Spelling("Preferred_Spelling"),
805+
Arabic_Tashkil("Arabic_Tashkil");
707806
private final PropertyNames<Do_Not_Emit_Type_Values> names;
708807

709808
private Do_Not_Emit_Type_Values(String shortName, String... otherNames) {
@@ -942,7 +1041,7 @@ public static Identifier_Status_Values forName(String name) {
9421041
}
9431042

9441043
public enum Identifier_Type_Values implements Named {
945-
Not_Character("nc", "not_chars"),
1044+
Not_Character("nc", "not_chars", "Not_Characters"),
9461045
Deprecated("d"),
9471046
Default_Ignorable("di"),
9481047
Not_NFKC("nn"),
@@ -1093,7 +1192,8 @@ public enum Indic_Positional_Category_Values implements Named {
10931192
Top_And_Left("Top_And_Left"),
10941193
Top_And_Left_And_Right("Top_And_Left_And_Right"),
10951194
Top_And_Right("Top_And_Right"),
1096-
Visual_Order_Left("Visual_Order_Left");
1195+
Visual_Order_Left("Visual_Order_Left"),
1196+
Invisible("Invisible");
10971197
private final PropertyNames<Indic_Positional_Category_Values> names;
10981198

10991199
private Indic_Positional_Category_Values(String shortName, String... otherNames) {
@@ -1157,7 +1257,8 @@ public enum Indic_Syllabic_Category_Values implements Named {
11571257
Visarga("Visarga"),
11581258
Vowel("Vowel"),
11591259
Vowel_Dependent("Vowel_Dependent"),
1160-
Vowel_Independent("Vowel_Independent");
1260+
Vowel_Independent("Vowel_Independent"),
1261+
Consonant_Repha("Consonant_Repha");
11611262
private final PropertyNames<Indic_Syllabic_Category_Values> names;
11621263

11631264
private Indic_Syllabic_Category_Values(String shortName, String... otherNames) {
@@ -1370,7 +1471,21 @@ public enum Joining_Group_Values implements Named {
13701471
Yudh("Yudh"),
13711472
Yudh_He("Yudh_He"),
13721473
Zain("Zain"),
1373-
Zhain("Zhain");
1474+
Zhain("Zhain"),
1475+
BAA("BAA"),
1476+
FA("FA"),
1477+
HAA("HAA"),
1478+
HA_GOAL("HA_GOAL"),
1479+
HA("HA"),
1480+
CAF("CAF"),
1481+
KNOTTED_HA("KNOTTED_HA"),
1482+
RA("RA"),
1483+
SWASH_CAF("SWASH_CAF"),
1484+
HAMZAH_ON_HA_GOAL("HAMZAH_ON_HA_GOAL"),
1485+
TAA_MARBUTAH("TAA_MARBUTAH"),
1486+
YA_BARREE("YA_BARREE"),
1487+
YA("YA"),
1488+
ALEF_MAQSURAH("ALEF_MAQSURAH");
13741489
private final PropertyNames<Joining_Group_Values> names;
13751490

13761491
private Joining_Group_Values(String shortName, String... otherNames) {

0 commit comments

Comments
 (0)