Skip to content

Commit 4739df1

Browse files
authored
The real 1.1.5 (#1098)
* Do not fall back when looking for 4.1 data or later
* spots
* Parse all versions, not just the ones that correspond to an age
* no toList
* no 2.1.0
* Another one
* Throw
* meow
* meow
* Nasty bug
* Try not to run out of memory.
* Is this the accursed 13.1 again?
* stray return
* blarg
* Works better than eggspected
* noncharacter
* bring back the cleanup
* spots
* bad regex
* bad limit
* Better handling of various kinds of nulls
* spots
* The Real UnicodeData-1.1.5.txt
* Reconstructions
* Fix Age derivation
* The -Update curse
* Read the reconstructed files
* Errors in reconstruction
* Better parsing of UnicodeData versions 1.x and 2.x.
* Another error in reconstruction, cf. https://www.unicode.org/versions/Unicode1.0.0/Notice.pdf
* Do not remove Name
* Remove incorrect and outdated UTS39 data
* Corrected reconstructions from Ken
* Correction to Ken’s correction
1 parent 1c15b71 commit 4739df1

File tree

8 files changed

+21241
-320
lines changed

8 files changed

+21241
-320
lines changed

UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeUtilities.java

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1385,7 +1385,6 @@ public static void showProperties(
13851385
} else {
13861386
name = toHTML.transliterate(name);
13871387
}
1388-
boolean allowed = XIDModifications.isAllowed(cp);
13891388

13901389
String scriptCat = getScriptCat("", cp);
13911390
if (showDevProperties) {
@@ -1415,16 +1414,6 @@ public static void showProperties(
14151414
out.append("<tr><td class='bigCode'>" + hex + "</td></tr>\n");
14161415
out.append("<tr><td class='bigName'>" + name + "</td></tr>\n");
14171416
out.append("<tr><td class='bigName'>" + scriptCat + "</td></tr>\n");
1418-
out.append("<tr><td class='bigName'><i>id:</i> ");
1419-
if (allowed) {
1420-
out.append("<span class='allowed'>allowed</span>");
1421-
} else {
1422-
out.append(
1423-
"<span class='restricted' title='Restricted in identifiers: "
1424-
+ XIDModifications.getType(cp)
1425-
+ "'>restricted</span>");
1426-
}
1427-
out.append("</td></tr>\n");
14281417
StringBuilder confusableString = displayConfusables(cp);
14291418
out.append(
14301419
"<tr><td class='bigName'><span title='Confusable Characters'><i>confuse:</i> </span>");
@@ -1438,7 +1427,7 @@ public static void showProperties(
14381427

14391428
List<String> availableNames = (List<String>) getFactory().getAvailableNames();
14401429
TreeSet<String> sortedProps =
1441-
Builder.with(new TreeSet<String>(col)).addAll(availableNames).remove("Name").get();
1430+
Builder.with(new TreeSet<String>(col)).addAll(availableNames).get();
14421431

14431432
String kRSUnicode = getFactory().getProperty("kRSUnicode").getValue(cp);
14441433
boolean isUnihan = kRSUnicode != null;
@@ -1484,7 +1473,7 @@ public static void showProperties(
14841473
history.equals("assigned") && age != Age_Values.Unassigned
14851474
? VersionInfo.getInstance(age.getShortName())
14861475
: history.equals("full")
1487-
? VersionInfo.getInstance(Age_Values.V1_1.getShortName())
1476+
? Utility.UNICODE_VERSIONS.get(Utility.UNICODE_VERSIONS.size() - 1)
14881477
: Settings.LAST_VERSION_INFO;
14891478
if (minVersion.compareTo(UcdLoader.getOldestLoadedUcd()) < 0) {
14901479
minVersion = UcdLoader.getOldestLoadedUcd();

unicodetools/data/ucd/1.0.0-Update/UnicodeData.txt

Lines changed: 7250 additions & 0 deletions
Large diffs are not rendered by default.

unicodetools/data/ucd/1.0.1-Update/UnicodeData.txt

Lines changed: 7272 additions & 0 deletions
Large diffs are not rendered by default.

unicodetools/data/ucd/1.1.0-Update/UnicodeData.txt

Lines changed: 6657 additions & 306 deletions
Large diffs are not rendered by default.

unicodetools/src/main/java/org/unicode/jsp/XIDModifications.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
import com.ibm.icu.impl.UnicodeMap;
44
import com.ibm.icu.text.UnicodeSet;
55

6+
// TODO(egg): This class seems to be based on an antique version of UTS #39 data, and also to return
7+
// inverted results (allowed for disallowed and vice-versa). Consider deleting now that we just
8+
// show real UTS #39 data in the tools.
69
public class XIDModifications {
710
private static UnicodeMap<String> allowed = new UnicodeMap(); // "[:XID_Continue:]");
811
private static UnicodeMap<String> reasons = new UnicodeMap<String>();

unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1153,6 +1153,51 @@ private static void parseUnicodeDataFile(
11531153
} else if (parts[1].contains("Hangul Syllable")) {
11541154
parts[1] = CONSTRUCTED_NAME;
11551155
hackHangul = true;
1156+
} else if (parts[1].contains("CJK Compatibility Ideograph")) {
1157+
// Unicode 2.0 through 2.1.2 have
1158+
// F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
1159+
// FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
1160+
// and this is replicated in the reconstructed 1.0.0 and 1.0.1 files.
1161+
parts[1] = "CJK COMPATIBILITY IDEOGRAPH-#";
1162+
} else if (parts[1].equals("<CJK IDEOGRAPH REPRESENTATIVE>")) {
1163+
// UnicodeData-1.1.5.txt does not have ranges yet, instead it has a
1164+
// representative that is meant to apply to ranges defined elsewhere.
1165+
// We inject these ranges here.
1166+
parts[1] = "CJK UNIFIED IDEOGRAPH-#";
1167+
// Start is already at 0x4E00, the representative.
1168+
line.getRange().end = 0x9FA5;
1169+
parseFields(
1170+
line,
1171+
indexUnicodeProperties,
1172+
nextProperties,
1173+
propInfoSet,
1174+
null,
1175+
hackHangul);
1176+
line.getRange().start = 0xF900;
1177+
line.getRange().end = 0xFA2D;
1178+
parts[1] = "CJK COMPATIBILITY IDEOGRAPH-#";
1179+
parseFields(
1180+
line,
1181+
indexUnicodeProperties,
1182+
nextProperties,
1183+
propInfoSet,
1184+
null,
1185+
hackHangul);
1186+
// UnicodeData-1.1.5.txt is also missing the PUA, which was defined only in
1187+
// wording. Inject it here while we are doing surgery on the surrounding CJK
1188+
// blocks. Note that the PUA already has its modern range E000..F8FF in 1.1, see
1189+
// https://www.unicode.org/versions/Unicode1.1.0/ch02.pdf.
1190+
line.getRange().start = 0xE000;
1191+
line.getRange().end = 0xF8FF;
1192+
parts[1] = null;
1193+
parts[2] = "Co";
1194+
parseFields(
1195+
line,
1196+
indexUnicodeProperties,
1197+
nextProperties,
1198+
propInfoSet,
1199+
null,
1200+
hackHangul);
11561201
} else {
11571202
parts[1] = null;
11581203
}

unicodetools/src/main/java/org/unicode/text/UCD/ToolUnicodePropertySource.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2241,6 +2241,15 @@ public String getAge(int codePoint) {
22412241
break;
22422242
}
22432243
if (ucdCache[i].isAllocated(codePoint)) {
2244+
if (i == UCD_Types.AGE11 && !ucdCache[i + 1].isAllocated(codePoint)) {
2245+
// Deallocations in Unicode 2.
2246+
continue;
2247+
}
2248+
return UCD_Names.LONG_AGE[i];
2249+
} else if (i == UCD_Types.AGE11
2250+
&& ((codePoint >= 0xE000 && codePoint <= 0xF8FF)
2251+
|| (codePoint >= 0xF900 && codePoint <= 0xFA2D))) {
2252+
// Private use and CJK compatibility ideographs, not explicitly listed in UnicodeData 1.1.5.
22442253
return UCD_Names.LONG_AGE[i];
22452254
}
22462255
}

unicodetools/src/main/java/org/unicode/text/utility/Utility.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -927,7 +927,9 @@ public static String join(long[] array, String divider) {
927927
"2.1.5",
928928
"2.1.2",
929929
"2.0.0",
930-
"1.1.0",
930+
"1.1.0", // Really 1.1.5.
931+
"1.0.1", // Reconstructed.
932+
"1.0.0", // Reconstructed.
931933
};
932934

933935
public static final List<VersionInfo> UNICODE_VERSIONS =

0 commit comments

Comments
 (0)