Skip to content

Commit cba8ecb

Browse files
authored
Parse the old dumps (#1091)
* Do not fall back when looking for 4.1 data or later * spots * Parse all versions, not just the ones that correspond to an age * no toList * no 2.1.0 * Another one * Throw * meow * meow * Nasty bug * Try not to run out of memory. * Is this the accursed 13.1 again? * stray return * blarg * Works better than eggspected * noncharacter * bring back the cleanup * spots * bad regex * bad limit
1 parent 4aa0dc0 commit cba8ecb

File tree

3 files changed

+58
-12
lines changed

3 files changed

+58
-12
lines changed

unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java

Lines changed: 54 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import org.unicode.props.PropertyUtilities.Merge;
2828
import org.unicode.props.UcdLineParser.IntRange;
2929
import org.unicode.props.UcdLineParser.UcdLine.Contents;
30+
import org.unicode.props.UcdPropertyValues.Binary;
3031
import org.unicode.text.utility.Settings;
3132
import org.unicode.text.utility.Utility;
3233

@@ -105,7 +106,7 @@ public PropertyParsingInfo(
105106
this.special = special;
106107
}
107108

108-
static final Pattern VERSION = Pattern.compile("v\\d+\\.\\d+");
109+
static final Pattern VERSION = Pattern.compile("v\\d+(\\.\\d+)+");
109110

110111
private static void fromStrings(String... propertyInfo) {
111112
if (propertyInfo.length < 2 || propertyInfo.length > 4) {
@@ -463,7 +464,7 @@ enum FileType {
463464
NamedSequences,
464465
NameAliases,
465466
StandardizedVariants,
466-
Confusables
467+
Confusables,
467468
}
468469

469470
static Map<String, FileType> file2Type = new HashMap<String, FileType>();
@@ -561,11 +562,18 @@ static void parseSourceFile(
561562
propInfoSet);
562563
break;
563564
case PropertyValue:
564-
parsePropertyValueFile(
565-
parser.withMissing(true),
566-
fileName,
567-
indexUnicodeProperties,
568-
nextProperties);
565+
if (fileName.equals("PropList")
566+
&& indexUnicodeProperties.ucdVersion.compareTo(
567+
VersionInfo.UNICODE_3_1_0)
568+
< 0) {
569+
parsePropertyDumpFile(fullFilename, indexUnicodeProperties, nextProperties);
570+
} else {
571+
parsePropertyValueFile(
572+
parser.withMissing(true),
573+
fileName,
574+
indexUnicodeProperties,
575+
nextProperties);
576+
}
569577
break;
570578
case Confusables:
571579
parseConfusablesFile(
@@ -769,6 +777,45 @@ private static void parseNamedSequencesFile(
769777
}
770778
}
771779

780+
/**
 * Parses a "property dump" style file and records every listed code point or range as
 * having the value "Yes" for the property named by the most recent dump heading.
 *
 * <p>The file is read line by line. A heading line selects the current property; each
 * following data line assigns that property to a code point or range. Lines that match
 * neither pattern are ignored.
 *
 * @param fullFilename path of the dump file to read
 * @param indexUnicodeProperties supplies the per-property map (via property2UnicodeMap)
 *     that receives the parsed values
 * @param nextProperties may be null; when non-null, its value for the same property is
 *     passed as the last argument of put (NOTE(review): presumably the next version's
 *     data, going by the name; confirm against PropertyParsingInfo.put)
 */
private static void parsePropertyDumpFile(
781+
String fullFilename,
782+
IndexUnicodeProperties indexUnicodeProperties,
783+
IndexUnicodeProperties nextProperties) {
784+
// Heading line: "Property dump for: 0x" + 8 hex digits + " (name)"; group 1 is the name.
final var dumpHeading = Pattern.compile("Property dump for: 0x[0-9A-F]{8} \\(([^()]+)\\)");
785+
// Data line: a lone 4-6 hex digit code point, or "XXXX..YYYY" followed by spaces and
// "(N chars)". The whole line must match.
final var dataLine =
786+
Pattern.compile("[0-9A-F]{4,6}(\\.\\.[0-9A-F]{4,6} +\\(\\d+ chars\\))?");
787+
// Property selected by the most recent heading; null while no known property is selected.
PropertyParsingInfo propInfo = null;
788+
for (String line : FileUtilities.in("", fullFilename)) {
789+
final var heading = dumpHeading.matcher(line);
790+
if (heading.matches()) {
791+
String name = heading.group(1);
792+
propInfo = property2PropertyInfo.get(UcdProperty.forString(name));
793+
if (propInfo == null) {
794+
// The dump's name has no direct match; handle the one known special case by hand.
if (name.equals("Not a Character")) {
795+
// Appears in 3.0.1. See also 84-M6 and 84-M7.
796+
propInfo = property2PropertyInfo.get(UcdProperty.Noncharacter_Code_Point);
797+
} else {
798+
// propInfo stays null, so the data lines under this heading are skipped below.
System.err.println("Ignoring unknown property in dump: " + name);
799+
}
800+
}
801+
continue;
802+
}
803+
if (propInfo != null && dataLine.matcher(line).matches()) {
804+
var range = new UcdLineParser.IntRange();
805+
// Only the text before the first space (the code point or "XXXX..YYYY") is given to the range.
range.set(line.split(" ", 2)[0]);
806+
final var data = indexUnicodeProperties.property2UnicodeMap.get(propInfo.property);
807+
// Every listed code point gets the value "Yes".
// NOTE(review): the meaning of the null fourth argument is not visible here; confirm in put.
propInfo.put(
808+
data,
809+
range,
810+
"Yes",
811+
null,
812+
nextProperties == null
813+
? null
814+
: nextProperties.getProperty(propInfo.property));
815+
}
816+
}
817+
}
818+
772819
private static void parsePropertyValueFile(
773820
UcdLineParser parser,
774821
String filename,

unicodetools/src/main/java/org/unicode/text/utility/Utility.java

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1546,11 +1546,6 @@ public static String getMostRecentUnicodeDataFile(
15461546
+ fileType
15471547
+ "'");
15481548
}
1549-
if ((versionString.startsWith("2.") || versionString.startsWith("3.0"))
1550-
&& filename.startsWith("Prop")) {
1551-
// Ignore the property dumps for now, as we do not have parsing logic for them.
1552-
return null;
1553-
}
15541549
return result;
15551550
}
15561551

unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,9 @@ SpecialCasing ; Lowercase_Mapping ; 1 ; SkipAny4
129129
SpecialCasing ; Titlecase_Mapping ; 2 ; SkipAny4
130130
SpecialCasing ; Uppercase_Mapping ; 3 ; SkipAny4
131131
DerivedCoreProperties ; Lowercase
132+
PropList ; Lowercase ; v3.0.1
132133
DerivedCoreProperties ; Uppercase
134+
PropList ; Uppercase ; v3.0.1
133135
DerivedCoreProperties ; Cased
134136
DerivedCoreProperties ; Case_Ignorable
135137
DerivedCoreProperties ; Changes_When_Lowercased
@@ -138,13 +140,15 @@ DerivedCoreProperties ; Changes_When_Titlecased
138140
DerivedCoreProperties ; Changes_When_Casefolded
139141
DerivedCoreProperties ; Changes_When_Casemapped
140142
DerivedCoreProperties ; Alphabetic
143+
PropList ; Alphabetic ; v3.0.1
141144
DerivedCoreProperties ; Default_Ignorable_Code_Point
142145
DerivedCoreProperties ; Grapheme_Base
143146
DerivedCoreProperties ; Grapheme_Extend
144147
# Deprecated and made derived in 5.0.
145148
DerivedCoreProperties ; Grapheme_Link
146149
PropList ; Grapheme_Link ; v4.1
147150
DerivedCoreProperties ; Math
151+
PropList ; Math ; v3.0.1
148152
DerivedCoreProperties ; ID_Start
149153
DerivedCoreProperties ; ID_Continue
150154
DerivedCoreProperties ; XID_Start

0 commit comments

Comments
 (0)