@@ -256,7 +256,13 @@ public void put(
256256 Merge <String > merger ,
257257 boolean hackHangul ,
258258 UnicodeProperty nextVersion ) {
259- // MEOW
259+ if (value == null && property == UcdProperty .Idn_2008 ) {
260+ // The IDNA2008 Status field of the IDNA mapping table is treated as an enumerated
261+ // property by the tools, with an Extra @missing line with a value na.
262+ // Unusually, the file has data lines with no IDNA2008 Status field; the default should
263+ // apply to these ranges.
264+ return ;
265+ }
260266 if (value != null
261267 && value .isEmpty ()
262268 && property != UcdProperty .NFKC_Casefold
@@ -409,8 +415,8 @@ public String checkRegex2(String string) {
409415 public String checkEnum (String string ) {
410416 final Enum item = string == null ? null : property .getEnum (string );
411417 if (item == null ) {
412- final String errorMessage = property + " \t Bad enum value: \t " + string ;
413- IndexUnicodeProperties . getDataLoadingErrors (). put ( property , errorMessage );
418+ throw new UnicodePropertyException (
419+ " \t Bad enum value for " + property + " : \t " + string );
414420 } else {
415421 string = item .toString ();
416422 }
@@ -746,9 +752,37 @@ private static void parsePropertyValueFile(
746752 IndexUnicodeProperties indexUnicodeProperties ,
747753 IndexUnicodeProperties nextProperties ) {
748754 for (UcdLineParser .UcdLine line : parser ) {
755+ if (line .getOriginalLine ().equals ("U+" )
756+ && filename .startsWith ("Unihan" )
757+ && indexUnicodeProperties .ucdVersion .getMajor () == 2 ) {
758+ // Truncated Unihan-1.txt in Unicode 2.0.
759+ return ;
760+ }
749761 String propName = line .getParts ()[1 ];
750762 UcdProperty item = UcdProperty .forString (propName );
751763
764+ String extractedValue = null ;
765+ if (item == null
766+ && indexUnicodeProperties .ucdVersion .compareTo (VersionInfo .UNICODE_4_0 ) <= 0 ) {
767+ // DerivedNormalizationProps Version 4.0 and earlier is highly irregular.
768+ // It provides the NFMeow_QC assignments as a single field NFMEOW_Value,
769+ // and calls FC_NFKC_Closure FNC. Since we need special handling for the former,
770+ // we also deal with FNC here instead of making it an Extra alias.
771+ if (propName .equals ("FNC" )) {
772+ propName = "FC_NFKC_Closure" ;
773+ item = UcdProperty .forString (propName );
774+ } else {
775+ String [] parts = propName .split ("_" );
776+ if (parts .length == 2
777+ && (parts [1 ].equals ("NO" ) | parts [1 ].equals ("MAYBE" ))
778+ && (parts [0 ].startsWith ("NF" ))) {
779+ propName = parts [0 ] + "_QC" ;
780+ item = UcdProperty .forString (propName );
781+ extractedValue = parts [1 ];
782+ }
783+ }
784+ }
785+
752786 if (item == null ) {
753787 throw new IllegalArgumentException (
754788 "Missing property enum in UcdProperty for "
@@ -773,9 +807,10 @@ private static void parsePropertyValueFile(
773807 // }
774808 String value ;
775809 // The file emoji-sequences.txt has a comment-like field after the binary property.
776- if (line .getParts ().length == 2
777- || filename .equals ("emoji/*/emoji-sequences" )
778- || filename .equals ("emoji/*/emoji-zwj-sequences" )) {
810+ if (extractedValue == null
811+ && (line .getParts ().length == 2
812+ || filename .equals ("emoji/*/emoji-sequences" )
813+ || filename .equals ("emoji/*/emoji-zwj-sequences" ))) {
779814 if (propInfo .property .getType () != PropertyType .Binary ) {
780815 throw new IllegalArgumentException (
781816 "Expected a value for "
@@ -785,7 +820,14 @@ private static void parsePropertyValueFile(
785820 }
786821 value = "Yes" ;
787822 } else {
788- value = line .getParts ()[2 ];
823+ value = extractedValue != null ? extractedValue : line .getParts ()[2 ];
824+ if (propInfo .property == UcdProperty .kJapaneseOn
825+ && indexUnicodeProperties .ucdVersion .equals (VersionInfo .UNICODE_3_1_0 )
826+ && value .isEmpty ()
827+ && line .getParts ().length == 4 ) {
828+ // Extra tab in the kJapaneseOn record for U+4E00.
829+ value = line .getParts ()[3 ];
830+ }
789831 if (propInfo .property .getType () == PropertyType .Binary ) {
790832 if (line .getType () == Contents .DATA
791833 && UcdPropertyValues .Binary .forName (value )
@@ -817,7 +859,10 @@ private static void parsePropertyValueFile(
817859 && !(propInfo .property == UcdProperty .NFKC_Casefold
818860 || propInfo .property == UcdProperty .NFKC_Simple_Casefold )) {
819861 throw new IllegalArgumentException (
820- "Unexpected empty value for property " + propName );
862+ "Unexpected empty value for property "
863+ + propName
864+ + ": "
865+ + line .getOriginalLine ());
821866 }
822867 if (propInfo .property == UcdProperty .kMandarin ) {
823868 if (indexUnicodeProperties .oldVersion ) {
@@ -941,26 +986,6 @@ private static void parseConfusablesFile(
941986 intRange ,
942987 parts [1 ],
943988 nextProperties == null ? null : nextProperties .getProperty (propInfo .property ));
944- intRange .set (parts [1 ]);
945- if (intRange .string == null ) {
946- if (!data .containsKey (intRange .start )) {
947- propInfo .put (
948- data ,
949- intRange ,
950- parts [1 ],
951- nextProperties == null
952- ? null
953- : nextProperties .getProperty (propInfo .property ));
954- }
955- } else if (!intRange .string .isEmpty () && !data .containsKey (intRange .string )) {
956- propInfo .put (
957- data ,
958- intRange ,
959- parts [1 ],
960- nextProperties == null
961- ? null
962- : nextProperties .getProperty (propInfo .property ));
963- }
964989 }
965990 }
966991
@@ -1111,7 +1136,12 @@ private static void parseFields(
11111136 throw new UnicodePropertyException ();
11121137 }
11131138 String value =
1114- propInfo .fieldNumber >= parts .length ? "" : parts [propInfo .fieldNumber ];
1139+ propInfo .fieldNumber >= parts .length ? null : parts [propInfo .fieldNumber ];
1140+ if (propInfo .property == UcdProperty .Joining_Group
1141+ && indexUnicodeProperties .ucdVersion .compareTo (VersionInfo .UNICODE_4_0 ) <= 0
1142+ && value .equals ("<no shaping>" )) {
1143+ value = "No_Joining_Group" ;
1144+ }
11151145 propInfo .put (
11161146 data ,
11171147 line .getMissingSet (),
@@ -1172,16 +1202,32 @@ private static void parseSimpleFieldFile(
11721202 IntRange range = new IntRange ();
11731203 range .start = Utility .codePointFromHex (line .getParts ()[0 ]);
11741204 range .end = Utility .codePointFromHex (line .getParts ()[1 ]);
1205+ // Unicode 2 puts FEFF both in Arabic Presentation Forms-B and in Specials.
1206+ // We are not going to make Block multivalued for that, so we let the second
1207+ // assignment win.
1208+ // This fits with assignments in Unicode 2.1.4..3.1.1 where
1209+ // Arabic Presentation Forms-B ended on FEFE and Specials was a
1210+ // split Block of FEFF & FFF0..FFFD.
1211+ // Since Unicode 3.2, blocks were contiguous xxx0..yyyF:
1212+ // https://www.unicode.org/reports/tr28/tr28-3.html#database
1213+ // The normative blocks defined in Blocks.txt have been adjusted slightly,
1214+ // in accordance with Unicode Technical Committee decisions.
1215+ // - Every block starts and ends on a column boundary.
1216+ // That is, the last digit of the first code point in the block is always 0,
1217+ // and the last digit of the final code point in the block is always F.
1218+ // - Every block is contiguous. [...]
11751219 propInfo .put (
11761220 data ,
11771221 line .getMissingSet (),
11781222 range ,
11791223 line .getParts ()[2 ],
1180- null ,
1224+ version . getMajor () == 2 ? new PropertyUtilities . Overrider () : null ,
11811225 false ,
11821226 nextVersion );
11831227 continue ;
1184- } else if (line .getParts ().length != 2 ) {
1228+ } else if (line .getParts ().length != 2
1229+ && version .compareTo (VersionInfo .UNICODE_3_0 ) > 0 ) {
1230+ // Unicode 3.0 and earlier had name comments as an extra field.
11851231 throw new IllegalArgumentException (
11861232 "Too many fields in " + line .getOriginalLine ());
11871233 }
0 commit comments