Skip to content

Commit 5118852

Browse files
authored
Correctly parse Numeric_Value since 1.0 (#1123)
1 parent fa8a895 commit 5118852

File tree

5 files changed

+131
-34
lines changed

5 files changed

+131
-34
lines changed

unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java

Lines changed: 113 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -40,16 +40,21 @@ enum SpecialProperty {
4040
Skip1FT,
4141
Skip1ST,
4242
SkipAny4,
43-
Rational
4443
}
4544

4645
private static final String NEW_UNICODE_PROPS_DOCS =
4746
"https://github.com/unicode-org/unicodetools/blob/main/docs/newunicodeproperties.md";
4847
private static final VersionInfo MIN_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
4948
public final UcdProperty property;
50-
public final int fieldNumber;
5149
public final SpecialProperty special;
5250

51+
/**
52+
* Maps from Unicode versions to field numbers. A property whose field number depends on the
53+
* version has more than one entry. A particular field number applies to the Unicode versions
54+
* after the previous-version entry, up to and including its own version.
55+
*/
56+
TreeMap<VersionInfo, Integer> fieldNumbers;
57+
5358
/**
5459
* Maps from Unicode versions to files. A property whose file depends on the version has more
5560
* than one entry. A particular file applies to the Unicode versions after the previous-version
@@ -102,7 +107,8 @@ public PropertyParsingInfo(
102107
this.files = new TreeMap<>();
103108
files.put(Settings.LATEST_VERSION_INFO, file);
104109
this.property = property;
105-
this.fieldNumber = fieldNumber;
110+
this.fieldNumbers = new TreeMap<>();
111+
fieldNumbers.put(Settings.LATEST_VERSION_INFO, fieldNumber);
106112
this.special = special;
107113
}
108114

@@ -121,6 +127,15 @@ private static void fromStrings(String... propertyInfo) {
121127
}
122128

123129
String last = propertyInfo[propertyInfo.length - 1];
130+
131+
int temp = 1;
132+
if (propertyInfo.length > 2
133+
&& !propertyInfo[2].isEmpty()
134+
&& !VERSION.matcher(propertyInfo[2]).matches()) {
135+
temp = Integer.parseInt(propertyInfo[2]);
136+
}
137+
int _fieldNumber = temp;
138+
124139
if (VERSION.matcher(last).matches()) {
125140
propertyInfo[propertyInfo.length - 1] = "";
126141
PropertyParsingInfo result = property2PropertyInfo.get(_property);
@@ -129,16 +144,11 @@ private static void fromStrings(String... propertyInfo) {
129144
"No modern info for property with old file record: " + propName);
130145
}
131146
result.files.put(VersionInfo.getInstance(last.substring(1)), _file);
147+
result.fieldNumbers.put(VersionInfo.getInstance(last.substring(1)), _fieldNumber);
132148
file2PropertyInfoSet.put(_file, result);
133149
return;
134150
}
135151

136-
int temp = 1;
137-
if (propertyInfo.length > 2 && !propertyInfo[2].isEmpty()) {
138-
temp = Integer.parseInt(propertyInfo[2]);
139-
}
140-
int _fieldNumber = temp;
141-
142152
SpecialProperty _special =
143153
propertyInfo.length < 4 || propertyInfo[3].isEmpty()
144154
? SpecialProperty.None
@@ -173,7 +183,7 @@ public String toString() {
173183
+ " ;\t"
174184
+ property
175185
+ " ;\t"
176-
+ fieldNumber
186+
+ fieldNumbers
177187
+ " ;\t"
178188
+ special
179189
+ " ;\t"
@@ -200,7 +210,8 @@ public int compareTo(PropertyParsingInfo arg0) {
200210
if (0 != (result = property.toString().compareTo(arg0.property.toString()))) {
201211
return result;
202212
}
203-
return fieldNumber - arg0.fieldNumber;
213+
return fieldNumbers.get(Settings.LATEST_VERSION_INFO)
214+
- arg0.fieldNumbers.get(Settings.LATEST_VERSION_INFO);
204215
}
205216

206217
public static String getFullFileName(UcdProperty prop, VersionInfo ucdVersion) {
@@ -227,6 +238,20 @@ public String getFileName(VersionInfo ucdVersionRequested) {
227238
}
228239
}
229240

241+
public int getFieldNumber(VersionInfo ucdVersionRequested) {
242+
int fieldNumber = 0;
243+
if (fieldNumbers.size() == 1) {
244+
return fieldNumbers.values().iterator().next();
245+
}
246+
for (final var entry : fieldNumbers.entrySet()) {
247+
if (ucdVersionRequested.compareTo(entry.getKey()) <= 0) {
248+
fieldNumber = entry.getValue();
249+
break;
250+
}
251+
}
252+
return fieldNumber;
253+
}
254+
230255
private static final VersionInfo V13 = VersionInfo.getInstance(13);
231256

232257
public static final Normalizer2 NFD = Normalizer2.getNFDInstance();
@@ -595,12 +620,11 @@ static void parseSourceFile(
595620
if (propInfoSet.size() == 1
596621
&& (propInfo = propInfoSet.iterator().next()).special
597622
== SpecialProperty.None
598-
&& propInfo.fieldNumber == 1) {
623+
&& propInfo.getFieldNumber(indexUnicodeProperties.ucdVersion) == 1) {
599624
parseSimpleFieldFile(
600625
parser.withMissing(true),
601626
propInfo,
602-
indexUnicodeProperties.property2UnicodeMap.get(propInfo.property),
603-
indexUnicodeProperties.ucdVersion,
627+
indexUnicodeProperties,
604628
nextProperties == null
605629
? null
606630
: nextProperties.getProperty(propInfo.property));
@@ -1255,18 +1279,6 @@ private static void parseFields(
12551279
switch (propInfo.special) {
12561280
case None:
12571281
break;
1258-
case Rational:
1259-
// int slashPos = string.indexOf('/');
1260-
// double rational;
1261-
// if (slashPos < 0) {
1262-
// rational = Double.parseDouble(string);
1263-
// } else {
1264-
// rational =
1265-
// Double.parseDouble(string.substring(0,slashPos)) /
1266-
// Double.parseDouble(string.substring(slashPos+1));
1267-
// }
1268-
// string = Double.toString(rational);
1269-
break;
12701282
case Skip1ST:
12711283
if ("ST".contains(parts[1])) {
12721284
continue;
@@ -1286,7 +1298,9 @@ private static void parseFields(
12861298
throw new UnicodePropertyException();
12871299
}
12881300
String value =
1289-
propInfo.fieldNumber >= parts.length ? null : parts[propInfo.fieldNumber];
1301+
propInfo.getFieldNumber(indexUnicodeProperties.ucdVersion) >= parts.length
1302+
? null
1303+
: parts[propInfo.getFieldNumber(indexUnicodeProperties.ucdVersion)];
12901304
if (propInfo.property == UcdProperty.Joining_Group
12911305
&& indexUnicodeProperties.ucdVersion.compareTo(VersionInfo.UNICODE_4_0_1)
12921306
<= 0
@@ -1326,7 +1340,9 @@ private static void parseFields(
13261340
} else {
13271341
for (final PropertyParsingInfo propInfo : propInfoSet) {
13281342
final String value =
1329-
propInfo.fieldNumber < parts.length ? parts[propInfo.fieldNumber] : null;
1343+
propInfo.getFieldNumber(indexUnicodeProperties.ucdVersion) < parts.length
1344+
? parts[propInfo.getFieldNumber(indexUnicodeProperties.ucdVersion)]
1345+
: null;
13301346
setPropDefault(
13311347
propInfo.property,
13321348
value,
@@ -1340,9 +1356,11 @@ private static void parseFields(
13401356
private static void parseSimpleFieldFile(
13411357
UcdLineParser parser,
13421358
PropertyParsingInfo propInfo,
1343-
UnicodeMap<String> data,
1344-
VersionInfo version,
1359+
IndexUnicodeProperties indexUnicodeProperties,
13451360
UnicodeProperty nextVersion) {
1361+
final UnicodeMap<String> data =
1362+
indexUnicodeProperties.property2UnicodeMap.get(propInfo.property);
1363+
final VersionInfo version = indexUnicodeProperties.ucdVersion;
13461364
for (UcdLineParser.UcdLine line : parser) {
13471365
if (line.getType() == UcdLineParser.UcdLine.Contents.DATA) {
13481366
if (propInfo.getDefaultValue(version) == null) {
@@ -1395,6 +1413,55 @@ private static void parseSimpleFieldFile(
13951413
false,
13961414
nextVersion);
13971415
continue;
1416+
} else if (propInfo.property == UcdProperty.Numeric_Value) {
1417+
String extractedValue = line.getParts()[1];
1418+
for (int cp = line.getRange().start; cp <= line.getRange().end; ++cp) {
1419+
String unicodeDataValue =
1420+
indexUnicodeProperties
1421+
.getProperty(UcdProperty.Non_Unihan_Numeric_Value)
1422+
.getValue(cp);
1423+
var range = new IntRange();
1424+
range.start = cp;
1425+
range.end = cp;
1426+
if (unicodeDataValue == null) {
1427+
if (!extractedValue.endsWith(".0")) {
1428+
throw new IllegalArgumentException(
1429+
"Non-integer numeric value extracted from Unihan for "
1430+
+ Utility.hex(cp)
1431+
+ ": "
1432+
+ extractedValue);
1433+
}
1434+
propInfo.put(
1435+
data,
1436+
line.getMissingSet(),
1437+
range,
1438+
extractedValue.substring(0, extractedValue.length() - 2),
1439+
null,
1440+
false,
1441+
nextVersion);
1442+
} else {
1443+
// Prior to Unicode 5.1, DerivedNumericValues.txt is useless for getting
1444+
// numeric values whose denominator is not a small power of two, as it
1445+
// only provides field 1, which is decimal with *mystery rounding* (in
1446+
// particular, not enough digits to disambiguate between binary32
1447+
// values).
1448+
// It is not normative either, so we use the value from UnicodeData.
1449+
// We use the values from DerivedNumericValues.txt when they are
1450+
// extracted from Unihan, as this avoids having to reconstruct old
1451+
// derivations here. In particular, Unihan numeric properties do *not*
1452+
// feed into the Numeric_Value until 4.0; see
1453+
// https://www.unicode.org/L2/L2003/03039.htm#94-C4.
1454+
propInfo.put(
1455+
data,
1456+
line.getMissingSet(),
1457+
range,
1458+
unicodeDataValue,
1459+
null,
1460+
false,
1461+
nextVersion);
1462+
}
1463+
}
1464+
continue;
13981465
} else if (line.getParts().length != 2
13991466
&& version.compareTo(VersionInfo.UNICODE_3_0_1) > 0) {
14001467
// Unicode 3.0 and earlier had name comments as an extra field.
@@ -1410,6 +1477,22 @@ private static void parseSimpleFieldFile(
14101477
false,
14111478
nextVersion);
14121479
} else {
1480+
if (propInfo.property == UcdProperty.Numeric_Value
1481+
&& line.getParts().length == 3
1482+
&& line.getParts()[1].isEmpty()
1483+
&& line.getParts()[2].equals("NaN")) {
1484+
// 5.1..6.1 have an improper line
1485+
// # @missing: 0000..10FFFF; ; NaN
1486+
// compare 6.2 and 6.3
1487+
// # @missing: 0000..10FFFF; NaN; ; NaN
1488+
// This causes the default for field 1 (which we use as the key for
1489+
// Numeric_Value, with some subsequent chicanery to actually get the data from
1490+
// UnicodeData) to be the empty string, rather than NaN.
1491+
// Before 5.1, there is no @missing line. After 6.3, the @missing line is in
1492+
// PropertyValueAliases, where it is independent of the format of the file
1493+
// specifying the property.
1494+
line.getParts()[1] = "NaN";
1495+
}
14131496
setPropDefault(
14141497
propInfo.property,
14151498
line.getParts()[1],

unicodetools/src/main/java/org/unicode/props/UcdProperty.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,10 @@ public enum UcdProperty {
125125
"Name_Alias"),
126126
Named_Sequences(PropertyType.Miscellaneous, DerivedPropertyStatus.UCDNonProperty, "NS"),
127127
Named_Sequences_Prov(PropertyType.Miscellaneous, DerivedPropertyStatus.UCDNonProperty, "NSP"),
128+
Non_Unihan_Numeric_Value(
129+
PropertyType.Miscellaneous,
130+
DerivedPropertyStatus.UCDNonProperty,
131+
"Non_Unihan_Numeric_Value"),
128132
Standardized_Variant(
129133
PropertyType.Miscellaneous,
130134
DerivedPropertyStatus.UCDNonProperty,

unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1909,6 +1909,7 @@ public static NFKD_Quick_Check_Values forName(String name) {
19091909
}
19101910
}
19111911

1912+
// Non_Unihan_Numeric_Value
19121913
// normalization_correction_corrected
19131914
// normalization_correction_original
19141915
// normalization_correction_version

unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,3 +198,7 @@ kNSHU_Reading ; kNSHU_Reading ; kReading ; UCDNonProperty
198198
kEH_Func ; kEH_Func ; Provisional
199199
kEH_FVal ; kEH_FVal ; Provisional
200200
kEH_UniK ; kEH_UniK ; Provisional
201+
202+
# Contributory non-property matching exactly field 8 of UnicodeData.txt.
203+
# Mostly useful as a helper to diachronically parse Numeric_Value.
204+
Non_Unihan_Numeric_Value ; Non_Unihan_Numeric_Value ; UCDNonProperty

unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,6 @@ FileType ; NamedSequencesProv ; NamedSequences
7373
# Field1 : property name
7474
# Field2 : field number in file (default is 1)
7575
# Field3 : special handling
76-
# Rational : Value is a rational number
7776
# Skip1FT : Skip line if field 1 is F or T
7877
# Skip1ST : Skip line if field 1 is S or T
7978
# SkipAny4 : Skip line if field 4 is not empty
@@ -86,10 +85,16 @@ UnicodeData; General_Category ; 2
8685
UnicodeData; Canonical_Combining_Class ; 3
8786
DerivedBidiClass; Bidi_Class ; 1
8887
DerivedDecompositionType; Decomposition_Type ; 1
89-
UnicodeData; Decomposition_Mapping ; 5
90-
UnicodeData; Bidi_Mirrored; 9
91-
DerivedNumericValues; Numeric_Value ; 3 ; Rational
88+
UnicodeData; Decomposition_Mapping ; 5
89+
# Handle the lack of rational Numeric_Value in older DerivedNumericValues by
90+
# reading it from UnicodeData via an unofficial contributory property.
91+
UnicodeData; Non_Unihan_Numeric_Value ; 8
92+
DerivedNumericValues; Numeric_Value ; 1
93+
# Prior to 3.1, there is no DerivedNumericValues.txt. But prior to 4.0, Unihan does not contribute
94+
# to Numeric_Value, so we can straightforwardly read Numeric_Value off of UnicodeData.
95+
UnicodeData; Numeric_Value ; 8 ; v3.0.1
9296
DerivedNumericType; Numeric_Type
97+
UnicodeData; Bidi_Mirrored; 9
9398
UnicodeData; Simple_Uppercase_Mapping ; 12
9499
UnicodeData; Simple_Lowercase_Mapping ; 13
95100
UnicodeData; Simple_Titlecase_Mapping ; 14

0 commit comments

Comments
 (0)