Skip to content

Commit 4aa0dc0

Browse files
authored
Version is not age (#1090)
* Do not fall back when looking for 4.1 data or later * spots * Parse all versions, not just the ones that correspond to an age * no toList * no 2.1.0 * Another one * Throw * meow * meow * Nasty bug * Try not to run out of memory. * Is this the accursed 13.1 again? * stray return
1 parent 3d42b4d commit 4aa0dc0

File tree

10 files changed

+152
-47
lines changed

10 files changed

+152
-47
lines changed

UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeUtilities.java

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
import com.ibm.icu.impl.Row.R4;
44
import com.ibm.icu.impl.UnicodeMap;
5-
import com.ibm.icu.impl.Utility;
65
import com.ibm.icu.lang.UCharacter;
76
import com.ibm.icu.lang.UProperty;
87
import com.ibm.icu.text.Collator;
@@ -57,6 +56,7 @@
5756
import org.unicode.props.UnicodeProperty;
5857
import org.unicode.props.UnicodeProperty.UnicodeMapProperty;
5958
import org.unicode.text.utility.Settings;
59+
import org.unicode.text.utility.Utility;
6060

6161
// For dependency management, it might be useful to split this omnibus class into
6262
// pieces by topic, such as collation utilities vs. IDNA utilities etc.
@@ -1719,11 +1719,8 @@ class PropertyAssignment {
17191719
// TODO(eggrobin): TUP normalization chokes on sufficiently old versions, but this is not
17201720
// worth debugging as we want to get rid of it.
17211721
if (!propName.startsWith("toNF")) {
1722-
for (var a : Age_Values.values()) {
1723-
if (a == Age_Values.Unassigned) {
1724-
break;
1725-
}
1726-
var version = VersionInfo.getInstance(a.getShortName());
1722+
for (int i = Utility.UNICODE_VERSIONS.size() - 1; i >= 0; --i) {
1723+
final var version = Utility.UNICODE_VERSIONS.get(i);
17271724
if (version.compareTo(minVersion) < 0) {
17281725
continue;
17291726
}

unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java

Lines changed: 12 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,6 @@
4747
import org.unicode.draft.UnicodeDataOutput.ItemWriter;
4848
import org.unicode.props.PropertyNames.Named;
4949
import org.unicode.props.PropertyUtilities.Merge;
50-
import org.unicode.props.UcdPropertyValues.Age_Values;
5150
import org.unicode.props.UcdPropertyValues.Binary;
5251
import org.unicode.props.UcdPropertyValues.General_Category_Values;
5352
import org.unicode.text.utility.Settings;
@@ -148,19 +147,16 @@ public static synchronized void useIncrementalProperties() {
148147
public static final synchronized IndexUnicodeProperties make(VersionInfo ucdVersion) {
149148
IndexUnicodeProperties newItem = version2IndexUnicodeProperties.get(ucdVersion);
150149
if (newItem == null) {
151-
Age_Values nextAge = Age_Values.Unassigned;
152-
for (int i = 0; i < Age_Values.values().length - 1; ++i) {
153-
final var version = VersionInfo.getInstance(Age_Values.values()[i].getShortName());
154-
if (version.equals(ucdVersion)) {
155-
nextAge = Age_Values.values()[i + 1];
156-
}
150+
if (!Utility.isUnicodeVersion(ucdVersion)) {
151+
throw new IllegalArgumentException("Not a Unicode version: " + ucdVersion);
157152
}
153+
VersionInfo nextVersion = Utility.getVersionFollowing(ucdVersion);
158154
IndexUnicodeProperties base =
159155
!incrementalProperties || ucdVersion == Settings.LAST_VERSION_INFO
160156
? null
161-
: nextAge == Age_Values.Unassigned
157+
: nextVersion == null
162158
? make(Settings.LAST_VERSION_INFO)
163-
: make(nextAge);
159+
: make(nextVersion);
164160
version2IndexUnicodeProperties.put(
165161
ucdVersion, newItem = new IndexUnicodeProperties(ucdVersion, base));
166162
}
@@ -176,8 +172,7 @@ public static final IndexUnicodeProperties make(UcdPropertyValues.Age_Values ucd
176172
}
177173

178174
public static final IndexUnicodeProperties make() {
179-
final Age_Values[] values = Age_Values.values();
180-
return make(values[values.length - 2]);
175+
return make(Settings.LATEST_VERSION_INFO);
181176
}
182177

183178
final VersionInfo ucdVersion;
@@ -927,22 +922,17 @@ public static void loadUcdHistory(
927922
useIncrementalProperties();
928923
System.out.println(
929924
"Loading back to " + (earliest == null ? "the dawn of time" : earliest) + "...");
930-
Age_Values[] ages = Age_Values.values();
931925
final long overallStart = System.currentTimeMillis();
932-
for (int i = ages.length - 2; i >= 0; --i) {
926+
for (int i = 0; i < Utility.UNICODE_VERSIONS.size(); ++i) {
933927
// Load in the order last (released, the base), latest (dev), penultimate,
934928
// antepenultimate, etc.
935-
final var age =
936-
ages[
937-
i == ages.length - 2
938-
? ages.length - 3
939-
: i == ages.length - 3 ? ages.length - 2 : i];
929+
final var version = Utility.UNICODE_VERSIONS.get(i == 0 ? 1 : i == 1 ? 0 : i);
940930
final long ucdStart = System.currentTimeMillis();
941-
System.out.println("Loading UCD " + age.getShortName() + "...");
931+
System.out.println("Loading UCD " + version + "...");
942932
for (boolean unihan : new boolean[] {false, true}) {
943933
final long partStart = System.currentTimeMillis();
944934
final String name = unihan ? "Unihan" : "non-Unihan properties";
945-
final var properties = IndexUnicodeProperties.make(age.getShortName());
935+
final var properties = IndexUnicodeProperties.make(version);
946936
for (UcdProperty property : UcdProperty.values()) {
947937
if (property.getShortName().startsWith("cjk") == unihan) {
948938
properties.load(property, expectCacheHit);
@@ -952,18 +942,17 @@ public static void loadUcdHistory(
952942
"Loaded "
953943
+ name
954944
+ " for "
955-
+ age.getShortName()
945+
+ version
956946
+ " ("
957947
+ (System.currentTimeMillis() - partStart)
958948
+ " ms)");
959949
}
960950
System.out.println(
961951
"Loaded UCD "
962-
+ age.getShortName()
952+
+ version
963953
+ " in "
964954
+ (System.currentTimeMillis() - ucdStart)
965955
+ " ms");
966-
var version = VersionInfo.getInstance(age.getShortName());
967956
if (notifyLoaded != null) {
968957
notifyLoaded.accept(version);
969958
}

unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java

Lines changed: 79 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -658,11 +658,34 @@ static void parseSourceFile(
658658
case Simple_Lowercase_Mapping:
659659
case Simple_Titlecase_Mapping:
660660
case Simple_Uppercase_Mapping:
661-
final UcdProperty sourceProp = propInfo.defaultValueType.property;
662-
final UnicodeMap<String> otherMap =
663-
indexUnicodeProperties.load(sourceProp); // recurse
664-
for (final String cp : nullValues) {
665-
data.put(cp, otherMap.get(cp));
661+
final var otherMap =
662+
indexUnicodeProperties.load(propInfo.defaultValueType.property);
663+
final UnicodeProperty otherProperty =
664+
indexUnicodeProperties.getProperty(propInfo.defaultValueType.property);
665+
final UnicodeProperty baseVersionOfThisProperty =
666+
indexUnicodeProperties.baseVersionProperties != null
667+
? indexUnicodeProperties.baseVersionProperties.getProperty(
668+
propInfo.property)
669+
: null;
670+
for (final int cp : nullValues.codePoints()) {
671+
// We cannot simply use the raw map otherMap for otherProperty, as it may
672+
// use the UNCHANGED_IN_BASE_VERSION placeholder.
673+
// If property X is defaulting to property Y, and property Y has the same
674+
// assignment as its next version Y′, that does not mean that X has the same
675+
// assignment as its next version X′. If that happens though, we should use
676+
// UNCHANGED_IN_BASE_VERSION.
677+
if (otherMap.get(cp)
678+
.equals(IndexUnicodeProperties.UNCHANGED_IN_BASE_VERSION)) {
679+
if (Objects.equals(
680+
otherProperty.getValue(cp),
681+
baseVersionOfThisProperty.getValue(cp))) {
682+
data.put(cp, IndexUnicodeProperties.UNCHANGED_IN_BASE_VERSION);
683+
} else {
684+
data.put(cp, otherProperty.getValue(cp));
685+
}
686+
} else {
687+
data.put(cp, otherMap.getValue(cp));
688+
}
666689
}
667690
// propInfo.defaultValueType =
668691
// property2PropertyInfo.get(sourceProp).defaultValueType; // reset to the type
@@ -783,12 +806,26 @@ private static void parsePropertyValueFile(
783806
}
784807
}
785808

809+
if (item == null
810+
&& indexUnicodeProperties.ucdVersion == VersionInfo.UNICODE_3_1_1
811+
&& propName.equals("297")) {
812+
// Missing field 1 in the record for U+64AC kPhonetic in Unihan 3.1.1.
813+
// See UAX #38:
814+
// The Version 3.1.1 Unihan database file, Unihan-3.1.1.txt, includes the
815+
// following anomalous record at line 246,442: U+64AC 297.
816+
extractedValue = propName;
817+
propName = "kPhonetic";
818+
item = UcdProperty.forString(propName);
819+
}
820+
786821
if (item == null) {
787822
throw new IllegalArgumentException(
788823
"Missing property enum in UcdProperty for "
789824
+ propName
790825
+ "\nSee "
791-
+ NEW_UNICODE_PROPS_DOCS);
826+
+ NEW_UNICODE_PROPS_DOCS
827+
+ ". At:"
828+
+ line.getOriginalLine());
792829
}
793830

794831
PropertyParsingInfo propInfo;
@@ -1025,6 +1062,11 @@ private static void parseNameAliasesFile(
10251062
}
10261063
}
10271064

1065+
static final Set<Integer> BROKEN_UNICODEDATA_LINES_IN_2_1_5 =
1066+
Set.of(
1067+
0xFA0E, 0xFA0F, 0xFA11, 0xFA13, 0xFA14, 0xFA1F, 0xFA21, 0xFA23, 0xFA24, 0xFA27,
1068+
0xFA28, 0xFA29);
1069+
10281070
private static void parseUnicodeDataFile(
10291071
UcdLineParser parser,
10301072
IndexUnicodeProperties indexUnicodeProperties,
@@ -1075,6 +1117,15 @@ private static void parseUnicodeDataFile(
10751117
// Decomposition_Mapping: Remove the decomposition type.
10761118
parts[5] = DECOMP_REMOVE.matcher(parts[5]).replaceAll("").trim();
10771119
}
1120+
if (indexUnicodeProperties.ucdVersion == VersionInfo.UNICODE_2_1_5
1121+
&& BROKEN_UNICODEDATA_LINES_IN_2_1_5.contains(line.getRange().start)) {
1122+
// These lines have the form
1123+
// FA0E;CJK COMPATIBILITY IDEOGRAPH-FA0E;Lo;0;L;;;;N;;;;;;
1124+
// Contrast 2.1.8
1125+
// FA0E;CJK COMPATIBILITY IDEOGRAPH-FA0E;Lo;0;L;;;;;N;;;;;
1126+
parts[9] = parts[8];
1127+
parts[8] = "";
1128+
}
10781129
parseFields(
10791130
line, indexUnicodeProperties, nextProperties, propInfoSet, null, hackHangul);
10801131
}
@@ -1138,10 +1189,30 @@ private static void parseFields(
11381189
String value =
11391190
propInfo.fieldNumber >= parts.length ? null : parts[propInfo.fieldNumber];
11401191
if (propInfo.property == UcdProperty.Joining_Group
1141-
&& indexUnicodeProperties.ucdVersion.compareTo(VersionInfo.UNICODE_4_0) <= 0
1192+
&& indexUnicodeProperties.ucdVersion.compareTo(VersionInfo.UNICODE_4_0_1)
1193+
<= 0
11421194
&& value.equals("<no shaping>")) {
11431195
value = "No_Joining_Group";
11441196
}
1197+
if (merger == null
1198+
&& propInfo.property == UcdProperty.Uppercase_Mapping
1199+
&& indexUnicodeProperties.ucdVersion == VersionInfo.UNICODE_2_1_8
1200+
&& line.getRange().start == 0x1F80
1201+
&& line.getRange().end == 0x1F80) {
1202+
// The first version of SpecialCasing.txt, version 2.1.8, has *three* lines for
1203+
// U+1F80:
1204+
// 1F80; 1F80; 1F88; 1F00 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND
1205+
// YPOGEGRAMMENI
1206+
// 1F80; 1F80; 1F88; 1F08 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND
1207+
// YPOGEGRAMMENI
1208+
// 1F80; 1F80; 1F88; 1F08 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND
1209+
// PROSGEGRAMMENI
1210+
// We let the last one win, as it is less incorrect than the first; in 2.1.9,
1211+
// the line for U+1F80 is:
1212+
// 1F80; 1F80; 1F88; 1F08 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND
1213+
// YPOGEGRAMMENI
1214+
merger = new PropertyUtilities.Overrider();
1215+
}
11451216
propInfo.put(
11461217
data,
11471218
line.getMissingSet(),
@@ -1226,7 +1297,7 @@ private static void parseSimpleFieldFile(
12261297
nextVersion);
12271298
continue;
12281299
} else if (line.getParts().length != 2
1229-
&& version.compareTo(VersionInfo.UNICODE_3_0) > 0) {
1300+
&& version.compareTo(VersionInfo.UNICODE_3_0_1) > 0) {
12301301
// Unicode 3.0.1 and earlier had name comments as an extra field.
12311302
throw new IllegalArgumentException(
12321303
"Too many fields in " + line.getOriginalLine());

unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -708,7 +708,8 @@ public enum Canonical_Combining_Class_Values implements Named {
708708
CCC141("141"),
709709
CCC142("142"),
710710
CCC143("143"),
711-
CCC144("144");
711+
CCC144("144"),
712+
CCC145("145");
712713
private final PropertyNames<Canonical_Combining_Class_Values> names;
713714

714715
private Canonical_Combining_Class_Values(String shortName, String... otherNames) {

unicodetools/src/main/java/org/unicode/text/UCD/VersionedSymbolTable.java

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import org.unicode.props.UnicodeProperty;
1717
import org.unicode.props.UnicodePropertySymbolTable;
1818
import org.unicode.text.utility.Settings;
19+
import org.unicode.text.utility.Utility;
1920

2021
/**
2122
* This class implements the semantics of property-query as defined in the UnicodeSet specification.
@@ -44,10 +45,7 @@ public static VersionedSymbolTable frozenAt(VersionInfo version) {
4445
var result = new VersionedSymbolTable();
4546
result.requireSuffixForLatest = false;
4647
result.implicitVersion = version;
47-
// TODO(egg): We should have a programmatic “previous version of Unicode”.
48-
// For now this ensures we fail on U-1.
49-
result.previousVersion = VersionInfo.getInstance(0);
50-
result.oldestLoadedUcd = () -> VersionInfo.UNICODE_1_1_0;
48+
result.previousVersion = Utility.getVersionPreceding(version);
5149
return result;
5250
}
5351

unicodetools/src/main/java/org/unicode/text/utility/Utility.java

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,12 @@
3737
import java.util.Comparator;
3838
import java.util.Iterator;
3939
import java.util.LinkedHashSet;
40+
import java.util.List;
4041
import java.util.Locale;
4142
import java.util.Map;
4243
import java.util.Set;
4344
import java.util.TreeSet;
45+
import java.util.stream.Collectors;
4446
import org.unicode.props.UnicodeProperty;
4547
import org.unicode.text.UCD.Default;
4648
import org.unicode.text.UCD.UCD;
@@ -928,6 +930,34 @@ public static String join(long[] array, String divider) {
928930
"1.1.0",
929931
};
930932

933+
public static final List<VersionInfo> UNICODE_VERSIONS =
934+
Arrays.asList(searchPath).stream()
935+
.map(VersionInfo::getInstance)
936+
.collect(Collectors.toList());
937+
938+
public static VersionInfo getVersionPreceding(VersionInfo version) {
939+
for (VersionInfo candidate : UNICODE_VERSIONS) {
940+
if (candidate.compareTo(version) < 0) {
941+
return candidate;
942+
}
943+
}
944+
return null;
945+
}
946+
947+
public static VersionInfo getVersionFollowing(VersionInfo version) {
948+
VersionInfo result = null;
949+
for (VersionInfo candidate : UNICODE_VERSIONS) {
950+
if (candidate.compareTo(version) > 0) {
951+
result = candidate;
952+
}
953+
}
954+
return result;
955+
}
956+
957+
public static boolean isUnicodeVersion(VersionInfo version) {
958+
return UNICODE_VERSIONS.contains(version);
959+
}
960+
931961
/*public static PrintWriter openPrintWriter(String filename) throws IOException {
932962
return openPrintWriter(filename, LATIN1_UNIX);
933963
}
@@ -1452,8 +1482,8 @@ public static String getMostRecentUnicodeDataFile(
14521482
continue;
14531483
}
14541484
if (version != null
1455-
&& version.compareTo(VersionInfo.UNICODE_4_1) >= 0
1456-
&& currentVersion.compareTo(version) < 0) {
1485+
&& currentVersion.compareTo(version) < 0
1486+
&& version.compareTo(VersionInfo.UNICODE_4_1) >= 0) {
14571487
// Do not look at earlier versions if we want Unicode 4.1 data or later.
14581488
// Unicode 4.0.1 is the last version for which unmodified files were not
14591489
// republished.
@@ -1489,6 +1519,14 @@ public static String getMostRecentUnicodeDataFile(
14891519
// TODO: Consider generally switching from using File to using the newer Path.
14901520
result = searchDirectory(path.toFile(), filename, show, fileType);
14911521
if (result != null) {
1522+
if (version != null && currentVersion.compareTo(version) < 0) {
1523+
// Even back when files were not republished, we copy them.
1524+
throw new IllegalStateException(
1525+
"File "
1526+
+ filename
1527+
+ " should be copied to version directory "
1528+
+ version);
1529+
}
14921530
break;
14931531
}
14941532
}

unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,7 @@ ccc; 141; CCC141 ; CCC141
199199
ccc; 142; CCC142 ; CCC142
200200
ccc; 143; CCC143 ; CCC143
201201
ccc; 144; CCC144 ; CCC144
202+
ccc; 145; CCC145 ; CCC145
202203

203204
# Unicode 2 joining groups, named after Unicode 1 names.
204205
# These should probably be made extra aliases of the non-extra values, but the

unicodetools/src/test/java/org/unicode/propstest/TestInvariants.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import com.ibm.icu.impl.Relation;
44
import com.ibm.icu.impl.UnicodeMap;
55
import com.ibm.icu.text.UnicodeSet;
6+
import com.ibm.icu.util.VersionInfo;
67
import java.util.ArrayList;
78
import java.util.Arrays;
89
import java.util.EnumMap;
@@ -47,7 +48,11 @@ public class TestInvariants extends TestFmwkMinusMinus {
4748
if (age == Age_Values.Unassigned) {
4849
continue;
4950
}
50-
IUPS.put(age, IndexUnicodeProperties.make(age.getShortName()));
51+
IUPS.put(
52+
age,
53+
age == Age_Values.V2_1
54+
? IndexUnicodeProperties.make(VersionInfo.UNICODE_2_1_2)
55+
: IndexUnicodeProperties.make(age.getShortName()));
5156
}
5257
}
5358

0 commit comments

Comments
 (0)