Skip to content

Commit f78603c

Browse files
authored
Parse MathClass and MathClassEx (#1213)
1 parent ab9f9af commit f78603c

File tree

11 files changed

+406
-9
lines changed

11 files changed

+406
-9
lines changed

unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -666,6 +666,18 @@ static void parseSourceFile(
666666
&& (propInfo = propInfoSet.iterator().next()).special
667667
== SpecialProperty.None
668668
&& propInfo.getFieldNumber(indexUnicodeProperties.ucdVersion) == 1) {
669+
if (fileName.equals("math/*/MathClass")
670+
&& indexUnicodeProperties.ucdVersion.compareTo(
671+
VersionInfo.UNICODE_6_3)
672+
<= 0) {
673+
parser =
674+
parser.withLinePreprocessor(
675+
s ->
676+
s.startsWith("1D455=210E;")
677+
|| s.equals("code point;class")
678+
? "#" + s
679+
: s);
680+
}
669681
parseSimpleFieldFile(
670682
parser.withMissing(true),
671683
propInfo,
@@ -674,6 +686,23 @@ static void parseSourceFile(
674686
? null
675687
: nextProperties.getProperty(propInfo.property));
676688
} else {
689+
if (fileName.equals("math/*/MathClassEx")
690+
&& indexUnicodeProperties.ucdVersion.compareTo(
691+
VersionInfo.UNICODE_6_3)
692+
<= 0) {
693+
// Old versions of MathClassEx had a malformed range and a line that
694+
// should have been commented out. Search for those specifically and
695+
// fix them; we don’t want to generally allow a new range syntax.
696+
parser =
697+
parser.withLinePreprocessor(
698+
s ->
699+
s.startsWith("FE61-FE68;")
700+
? s.replaceFirst(
701+
"FE61-FE68;", "FE61..FE68;")
702+
: s.startsWith("1D455=210E;")
703+
? "#" + s
704+
: s);
705+
}
677706
parseFieldFile(
678707
parser.withMissing(true),
679708
indexUnicodeProperties,
@@ -1510,6 +1539,27 @@ private static void parseFields(
15101539
value = "No";
15111540
}
15121541
}
1542+
if ((propInfo.property == UcdProperty.Math_Entity_Name
1543+
|| propInfo.property == UcdProperty.Math_Entity_Set
1544+
|| propInfo.property == UcdProperty.Math_Class_Ex)
1545+
&& indexUnicodeProperties.ucdVersion.compareTo(Utility.UTR25_REVISION_16)
1546+
< 0) {
1547+
merger = new PropertyUtilities.RedundancyIgnoringMultivaluedJoiner();
1548+
}
1549+
if (propInfo.property == UcdProperty.Math_Descriptive_Comments
1550+
&& indexUnicodeProperties.ucdVersion.compareTo(Utility.UTR25_REVISION_16)
1551+
< 0) {
1552+
merger = new PropertyUtilities.NullIgnorer();
1553+
}
1554+
if (propInfo.property == UcdProperty.Math_Class_Ex
1555+
&& indexUnicodeProperties.ucdVersion.compareTo(VersionInfo.UNICODE_6_1) < 0
1556+
&& value.isEmpty()) {
1557+
// MathClassEx-12 has
1558+
// 27CA;;;;;;VERTICAL BAR WITH HORIZONTAL STROKE
1559+
// MathClassEx-11 has
1560+
// 21EA..21F3;;⇪..⇳;;;; 21EA-21F3 are keyboard
1561+
value = "None";
1562+
}
15131563
propInfo.put(
15141564
data,
15151565
line.getMissingSet(),
@@ -1569,6 +1619,7 @@ private static void parseSimpleFieldFile(
15691619
propInfo.property, defaultValue, "hardcoded", false, version);
15701620
}
15711621
}
1622+
Merge<String> merger = null;
15721623
if (line.getParts().length == 3 && propInfo.property == UcdProperty.Block) {
15731624
// The old Blocks files had First; Last; Block.
15741625
IntRange range = new IntRange();
@@ -1646,6 +1697,13 @@ private static void parseSimpleFieldFile(
16461697
}
16471698
}
16481699
continue;
1700+
} else if (propInfo.property == UcdProperty.Math_Class
1701+
&& version.compareTo(VersionInfo.UNICODE_6_0) < 0) {
1702+
merger = new PropertyUtilities.RedundancyIgnoringMultivaluedJoiner();
1703+
// MathClass-11 had a line without a value, 21EA..21F3;
1704+
if (line.getParts()[1].isEmpty()) {
1705+
line.getParts()[1] = "None";
1706+
}
16491707
} else if (line.getParts().length != 2
16501708
&& version.compareTo(VersionInfo.UNICODE_3_0_1) > 0) {
16511709
// Unicode 3.0 and earlier had name comments as an extra field.
@@ -1657,7 +1715,7 @@ private static void parseSimpleFieldFile(
16571715
line.getMissingSet(),
16581716
line.getRange(),
16591717
line.getParts()[1],
1660-
null,
1718+
merger,
16611719
false,
16621720
nextVersion);
16631721
} else {

unicodetools/src/main/java/org/unicode/props/PropertyUtilities.java

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
package org.unicode.props;
22

3+
import com.google.common.base.Objects;
34
import com.ibm.icu.impl.UnicodeMap;
45
import com.ibm.icu.text.UnicodeSet;
56
import java.util.Collection;
67
import java.util.Map;
8+
import java.util.Set;
79
import org.unicode.text.utility.Utility;
810

911
public class PropertyUtilities {
@@ -37,6 +39,51 @@ public String merge(String first, String second) {
3739
}
3840
}
3941

42+
public static final class NullIgnorer implements Merge<String> {
43+
public NullIgnorer() {}
44+
45+
@Override
46+
public String merge(String first, String second) {
47+
if (second == null) {
48+
return first;
49+
} else {
50+
throw new UnicodePropertyException(
51+
"Key already present in UnicodeMap:\told: " + first + ",\tnew: " + second);
52+
}
53+
}
54+
}
55+
56+
public static final class RedundancyIgnorer implements Merge<String> {
57+
public RedundancyIgnorer() {}
58+
59+
@Override
60+
public String merge(String first, String second) {
61+
if (Objects.equal(first, second)) {
62+
return first;
63+
} else {
64+
throw new UnicodePropertyException(
65+
"Key already present in UnicodeMap:\told: " + first + ",\tnew: " + second);
66+
}
67+
}
68+
}
69+
70+
public static final class RedundancyIgnoringMultivaluedJoiner implements Merge<String> {
71+
public RedundancyIgnoringMultivaluedJoiner() {}
72+
73+
@Override
74+
public String merge(String first, String second) {
75+
if (first == null) {
76+
return second;
77+
}
78+
final Set<String> oldValues = Set.of(first.split("\\|"));
79+
if (second == null || oldValues.contains(second)) {
80+
return first;
81+
} else {
82+
return first + "|" + second;
83+
}
84+
}
85+
}
86+
4087
static final <K, V, M extends Map<K, V>> M putNew(M map, K key, V value) {
4188
final V oldValue = map.get(key);
4289
if (oldValue != null) {

unicodetools/src/main/java/org/unicode/props/UcdLineParser.java

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import java.util.ArrayList;
55
import java.util.Iterator;
66
import java.util.NoSuchElementException;
7+
import java.util.function.Function;
78
import java.util.regex.Matcher;
89
import java.util.regex.Pattern;
910
import org.unicode.cldr.util.RegexUtilities;
@@ -85,18 +86,21 @@ public enum Contents {
8586
private final ArrayList<String> partsList = new ArrayList<>();
8687
private String[] parts = null;
8788
private final IntRange intRange = new IntRange();
89+
private final Function<String, String> linePreprocessor;
8890

8991
UcdLine(
9092
Pattern splitPattern,
9193
boolean withRange,
9294
boolean withMissing,
9395
Iterator<String> rawLines,
94-
UcdFileStats stats) {
96+
UcdFileStats stats,
97+
Function<String, String> linePreprocessor) {
9598
splitter = splitPattern.matcher("");
9699
this.withRange = withRange;
97100
this.withMissing = withMissing;
98101
this.rawLines = rawLines;
99102
this.stats = stats;
103+
this.linePreprocessor = linePreprocessor;
100104
}
101105

102106
@Override
@@ -117,6 +121,9 @@ public boolean hasNext() {
117121
|| line.startsWith(">>>>>>>")) {
118122
line2 = "";
119123
}
124+
if (linePreprocessor != null) {
125+
line2 = linePreprocessor.apply(line2);
126+
}
120127
++stats.lineCount;
121128
final int hashPos = line2.indexOf('#');
122129
if (hashPos >= 0) {
@@ -223,6 +230,7 @@ public UnicodeSet getMissingSet() {
223230
private boolean withTabs = false;
224231
private boolean withRange = true;
225232
private boolean withMissing = false;
233+
private Function<String, String> linePreprocessor;
226234
private final Iterable<String> rawLines;
227235
private final UcdFileStats stats = new UcdFileStats();
228236

@@ -245,10 +253,25 @@ public UcdLineParser withMissing(boolean m) {
245253
return this;
246254
}
247255

256+
// Sets a line preprocessor to which the line is fed before removing comments,
257+
// splitting fields, and decoding ranges.
258+
// This makes it possible to correct lines with ill-formed ranges.
259+
// For corrections affecting only subsequent fields rather than the range,
260+
// prefer handling in the parse* functions in PropertyParsingInfo.
261+
public UcdLineParser withLinePreprocessor(Function<String, String> f) {
262+
linePreprocessor = f;
263+
return this;
264+
}
265+
248266
@Override
249267
public Iterator<UcdLine> iterator() {
250268
return new UcdLine(
251-
withTabs ? TAB : SEMICOLON, withRange, withMissing, rawLines.iterator(), stats);
269+
withTabs ? TAB : SEMICOLON,
270+
withRange,
271+
withMissing,
272+
rawLines.iterator(),
273+
stats,
274+
linePreprocessor);
252275
}
253276

254277
public int getLineCount() {

unicodetools/src/main/java/org/unicode/props/UcdProperty.java

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
import org.unicode.props.UcdPropertyValues.Joining_Group_Values;
2727
import org.unicode.props.UcdPropertyValues.Joining_Type_Values;
2828
import org.unicode.props.UcdPropertyValues.Line_Break_Values;
29+
import org.unicode.props.UcdPropertyValues.Math_Class_Ex_Values;
30+
import org.unicode.props.UcdPropertyValues.Math_Class_Values;
2931
import org.unicode.props.UcdPropertyValues.NFC_Quick_Check_Values;
3032
import org.unicode.props.UcdPropertyValues.NFD_Quick_Check_Values;
3133
import org.unicode.props.UcdPropertyValues.NFKC_Quick_Check_Values;
@@ -124,6 +126,22 @@ public enum UcdProperty {
124126
Emoji_SB(PropertyType.Miscellaneous, DerivedPropertyStatus.UCDNonProperty, "ESB"),
125127
ISO_Comment(PropertyType.Miscellaneous, DerivedPropertyStatus.Approved, "isc"),
126128
Jamo_Short_Name(PropertyType.Miscellaneous, DerivedPropertyStatus.Approved, "JSN"),
129+
Math_Descriptive_Comments(
130+
PropertyType.Miscellaneous,
131+
DerivedPropertyStatus.NonUCDNonProperty,
132+
"Math_Descriptive_Comments"),
133+
Math_Entity_Name(
134+
PropertyType.Miscellaneous,
135+
DerivedPropertyStatus.NonUCDNonProperty,
136+
null,
137+
ValueCardinality.Unordered,
138+
"Math_Entity_Name"),
139+
Math_Entity_Set(
140+
PropertyType.Miscellaneous,
141+
DerivedPropertyStatus.NonUCDNonProperty,
142+
null,
143+
ValueCardinality.Unordered,
144+
"Math_Entity_Set"),
127145
Name(PropertyType.Miscellaneous, DerivedPropertyStatus.Approved, "na"),
128146
Name_Alias(
129147
PropertyType.Miscellaneous,
@@ -713,6 +731,18 @@ public enum UcdProperty {
713731
Line_Break_Values.class,
714732
null,
715733
"lb"),
734+
Math_Class(
735+
PropertyType.Enumerated,
736+
DerivedPropertyStatus.NonUCDProperty,
737+
Math_Class_Values.class,
738+
ValueCardinality.Ordered,
739+
"Math_Class"),
740+
Math_Class_Ex(
741+
PropertyType.Enumerated,
742+
DerivedPropertyStatus.NonUCDNonProperty,
743+
Math_Class_Ex_Values.class,
744+
ValueCardinality.Ordered,
745+
"Math_Class_Ex"),
716746
NFC_Quick_Check(
717747
PropertyType.Enumerated,
718748
DerivedPropertyStatus.Approved,

unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1817,6 +1817,96 @@ public static Line_Break_Values forName(String name) {
18171817
}
18181818

18191819
// Lowercase_Mapping
1820+
public enum Math_Class_Values implements Named {
1821+
None("None"),
1822+
Normal("N"),
1823+
Alphabetic("A"),
1824+
Binary("B"),
1825+
Closing("C"),
1826+
Diacritic("D"),
1827+
Fence("F"),
1828+
Glyph_Part("G"),
1829+
Invisible("I"),
1830+
Large("L"),
1831+
Opening("O"),
1832+
Punctuation("P"),
1833+
Relation("R", "R?"),
1834+
Space("S"),
1835+
Unary("U"),
1836+
Vary("V"),
1837+
Special("X");
1838+
private final PropertyNames<Math_Class_Values> names;
1839+
1840+
private Math_Class_Values(String shortName, String... otherNames) {
1841+
names =
1842+
new PropertyNames<Math_Class_Values>(
1843+
Math_Class_Values.class, this, shortName, otherNames);
1844+
}
1845+
1846+
@Override
1847+
public PropertyNames<Math_Class_Values> getNames() {
1848+
return names;
1849+
}
1850+
1851+
@Override
1852+
public String getShortName() {
1853+
return names.getShortName();
1854+
}
1855+
1856+
private static final NameMatcher<Math_Class_Values> NAME_MATCHER =
1857+
PropertyNames.getNameToEnums(Math_Class_Values.class);
1858+
1859+
public static Math_Class_Values forName(String name) {
1860+
return NAME_MATCHER.get(name);
1861+
}
1862+
}
1863+
1864+
public enum Math_Class_Ex_Values implements Named {
1865+
None("None"),
1866+
Normal("N"),
1867+
Alphabetic("A"),
1868+
Binary("B"),
1869+
Closing("C"),
1870+
Diacritic("D"),
1871+
Fence("F"),
1872+
Glyph_Part("G"),
1873+
Large("L"),
1874+
Opening("O"),
1875+
Punctuation("P"),
1876+
Relation("R", "R?"),
1877+
Space("S"),
1878+
Unary("U"),
1879+
Vary("V"),
1880+
Special("X");
1881+
private final PropertyNames<Math_Class_Ex_Values> names;
1882+
1883+
private Math_Class_Ex_Values(String shortName, String... otherNames) {
1884+
names =
1885+
new PropertyNames<Math_Class_Ex_Values>(
1886+
Math_Class_Ex_Values.class, this, shortName, otherNames);
1887+
}
1888+
1889+
@Override
1890+
public PropertyNames<Math_Class_Ex_Values> getNames() {
1891+
return names;
1892+
}
1893+
1894+
@Override
1895+
public String getShortName() {
1896+
return names.getShortName();
1897+
}
1898+
1899+
private static final NameMatcher<Math_Class_Ex_Values> NAME_MATCHER =
1900+
PropertyNames.getNameToEnums(Math_Class_Ex_Values.class);
1901+
1902+
public static Math_Class_Ex_Values forName(String name) {
1903+
return NAME_MATCHER.get(name);
1904+
}
1905+
}
1906+
1907+
// Math_Descriptive_Comments
1908+
// Math_Entity_Name
1909+
// Math_Entity_Set
18201910
// Name
18211911
// Name_Alias
18221912
// Named_Sequences

0 commit comments

Comments
 (0)