Skip to content

Commit 8dcaecc

Browse files
committed
NamesList in IUP
1 parent 5118852 commit 8dcaecc

File tree

6 files changed

+174
-0
lines changed

6 files changed

+174
-0
lines changed

unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -490,6 +490,7 @@ enum FileType {
490490
NameAliases,
491491
StandardizedVariants,
492492
Confusables,
493+
NamesList,
493494
}
494495

495496
static Map<String, FileType> file2Type = new HashMap<String, FileType>();
@@ -616,6 +617,13 @@ static void parseSourceFile(
616617
parseUnicodeDataFile(
617618
parser, indexUnicodeProperties, nextProperties, propInfoSet);
618619
break;
620+
case NamesList:
621+
parseNamesListFile(
622+
FileUtilities.in("", fullFilename),
623+
indexUnicodeProperties,
624+
nextProperties,
625+
propInfoSet);
626+
break;
619627
case Field:
620628
if (propInfoSet.size() == 1
621629
&& (propInfo = propInfoSet.iterator().next()).special
@@ -754,6 +762,122 @@ static void parseSourceFile(
754762
}
755763
}
756764

765+
/**
 * Parses the lines of a NamesList-format file and records, per code point, the values of
 * the five properties Names_List_Subheader, Names_List_Subheader_Notice,
 * Names_List_Cross_Ref, Names_List_Comment and Names_List_Alias.
 *
 * <p>Each line is split on runs of tab characters and classified by its first field:
 *
 * <ul>
 *   <li>two fields, the first matching {@code [0-9A-F]{4,6}}: starts a character entry;
 *       the current subheader and subheader notice (if any) are recorded for it;
 *   <li>two fields, the first empty (or {@code @+} followed by a {@code "* "} field),
 *       while a character entry is current: an annotation of that character, where
 *       {@code "x "} is a cross reference, {@code "* "} a comment and {@code "= "} an
 *       alias; any other annotation prefix is ignored;
 *   <li>two fields, the first {@code @}: a new subheader;
 *   <li>four fields, the first {@code @@}: a new block header, which clears the subheader;
 *   <li>two fields, the first {@code @+}, seen after a subheader and before any character
 *       entry: the notice attached to that subheader.
 * </ul>
 *
 * @param lines the lines of the file to parse
 * @param indexUnicodeProperties supplies {@code property2UnicodeMap}, from which the
 *     target map of each of the five properties is looked up
 * @param nextProperties may be {@code null}; when non-null, the same-named property
 *     obtained from it is passed along to every {@code put} call (how {@code put} uses
 *     it is not visible here)
 * @param propInfoSet not read by this method
 */
private static void parseNamesListFile(
766+
Iterable<String> lines,
767+
IndexUnicodeProperties indexUnicodeProperties,
768+
IndexUnicodeProperties nextProperties,
769+
Set<PropertyParsingInfo> propInfoSet) {
770+
// Matches the first field of a character entry line: 4 to 6 uppercase hex digits.
final var namesListChar = Pattern.compile("[0-9A-F]{4,6}");
771+
// For each of the five properties, fetch three things: its parsing info, the
// corresponding property of nextProperties (null when nextProperties is null), and the
// map in indexUnicodeProperties that receives the parsed values.
final var subheaderPropInfo = property2PropertyInfo.get(UcdProperty.Names_List_Subheader);
772+
final var nextSubheader =
773+
nextProperties == null
774+
? null
775+
: nextProperties.getProperty(UcdProperty.Names_List_Subheader);
776+
final UnicodeMap<String> subheaderData =
777+
indexUnicodeProperties.property2UnicodeMap.get(UcdProperty.Names_List_Subheader);
778+
final var subheaderNoticePropInfo =
779+
property2PropertyInfo.get(UcdProperty.Names_List_Subheader_Notice);
780+
final var nextSubheaderNotice =
781+
nextProperties == null
782+
? null
783+
: nextProperties.getProperty(UcdProperty.Names_List_Subheader_Notice);
784+
final UnicodeMap<String> subheaderNoticeData =
785+
indexUnicodeProperties.property2UnicodeMap.get(
786+
UcdProperty.Names_List_Subheader_Notice);
787+
final var crossReferencePropInfo =
788+
property2PropertyInfo.get(UcdProperty.Names_List_Cross_Ref);
789+
final var nextCrossReference =
790+
nextProperties == null
791+
? null
792+
: nextProperties.getProperty(UcdProperty.Names_List_Cross_Ref);
793+
final UnicodeMap<String> crossReferenceData =
794+
indexUnicodeProperties.property2UnicodeMap.get(UcdProperty.Names_List_Cross_Ref);
795+
final var commentPropInfo = property2PropertyInfo.get(UcdProperty.Names_List_Comment);
796+
final var nextComment =
797+
nextProperties == null
798+
? null
799+
: nextProperties.getProperty(UcdProperty.Names_List_Comment);
800+
final UnicodeMap<String> commentData =
801+
indexUnicodeProperties.property2UnicodeMap.get(UcdProperty.Names_List_Comment);
802+
final var aliasPropInfo = property2PropertyInfo.get(UcdProperty.Names_List_Alias);
803+
final var nextAlias =
804+
nextProperties == null
805+
? null
806+
: nextProperties.getProperty(UcdProperty.Names_List_Alias);
807+
final UnicodeMap<String> aliasData =
808+
indexUnicodeProperties.property2UnicodeMap.get(UcdProperty.Names_List_Alias);
809+
810+
// NOTE(review): this mutates the shared parsing info of the alias and comment
// properties. Presumably it keeps free-text values from being split into several
// values on the usual separator; confirm against the definition of NO_SPLIT.
aliasPropInfo.multivaluedSplit = NO_SPLIT;
811+
commentPropInfo.multivaluedSplit = NO_SPLIT;
812+
813+
// Parser state: the subheader and notice in effect, and the character entry that
// annotation lines currently attach to (null when no entry is current).
String subheader = null;
814+
String subheaderNotice = null;
815+
IntRange codePoint = null;
816+
for (String line : lines) {
817+
// A line starting with a tab yields an empty first field; trailing empty fields are
// dropped by String.split.
String[] parts = line.split("\t+");
818+
if (parts.length == 2 && namesListChar.matcher(parts[0]).matches()) {
819+
// Character entry: it becomes the target of the annotation lines that follow, and
// inherits the subheader and notice currently in effect.
codePoint = new IntRange();
820+
codePoint.set(parts[0]);
821+
if (subheader != null) {
822+
subheaderPropInfo.put(subheaderData, codePoint, subheader, nextSubheader);
823+
}
824+
if (subheaderNotice != null) {
825+
subheaderNoticePropInfo.put(
826+
subheaderNoticeData, codePoint, subheaderNotice, nextSubheaderNotice);
827+
}
828+
} else if (codePoint != null
829+
&& parts.length == 2
830+
&& (parts[0].isEmpty()
831+
|| (parts[0].equals("@+") && parts[1].startsWith("* ")))) {
832+
// Annotation of the current character. An "@+" line is only admitted here when its
// text starts with "* ", so it always lands in the comment branch below.
if (parts[1].startsWith("x ")) {
833+
String crossReference;
834+
// NOTE(review): charAt(2) and the [1] indexing below are unguarded; a bare "x " or
// a reference without the expected separator would throw. Confirm that the input is
// always well formed.
if (parts[1].charAt(2) == '(') {
835+
// Parenthesized form: keep the segment that follows the first " - " separator and
// ends at the next " - " or ")".
crossReference = parts[1].split(" \\- |\\)")[1];
836+
} else {
837+
// Bare form: keep the token right after "x ".
crossReference = parts[1].split(" ")[1];
838+
}
839+
crossReferencePropInfo.put(
840+
crossReferenceData,
841+
codePoint,
842+
crossReference,
843+
IndexUnicodeProperties.MULTIVALUED_JOINER,
844+
nextCrossReference);
845+
} else if (parts[1].startsWith("* ")) {
846+
// Comment: stored without its two-character "* " prefix.
commentPropInfo.put(
847+
commentData,
848+
codePoint,
849+
parts[1].substring(2),
850+
IndexUnicodeProperties.MULTIVALUED_JOINER,
851+
nextComment);
852+
} else if (parts[1].startsWith("= ")) {
853+
// Alias: stored without its two-character "= " prefix.
aliasPropInfo.put(
854+
aliasData,
855+
codePoint,
856+
parts[1].substring(2),
857+
IndexUnicodeProperties.MULTIVALUED_JOINER,
858+
nextAlias);
859+
}
860+
}
861+
// The three checks below are independent of the chain above and only update the
// parser state.
if (parts.length == 2 && parts[0].equals("@")) {
862+
// New subheader: drop the previous notice and detach from the last character.
subheader = parts[1];
863+
subheaderNotice = null;
864+
codePoint = null;
865+
}
866+
if (parts.length == 4 && parts[0].equals("@@")) {
867+
// New block header, clear the current subheader.
868+
subheader = null;
869+
subheaderNotice = null;
870+
codePoint = null;
871+
}
872+
// An "@+" line seen after a subheader but before any character entry is the notice
// of that subheader; a later one replaces an earlier one.
if (parts.length == 2
873+
&& parts[0].equals("@+")
874+
&& codePoint == null
875+
&& subheader != null) {
876+
subheaderNotice = parts[1];
877+
}
878+
}
879+
}
880+
757881
private static void parseCJKRadicalsFile(
758882
UcdLineParser parser,
759883
PropertyParsingInfo propInfo,

unicodetools/src/main/java/org/unicode/props/UcdProperty.java

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,12 @@ public enum UcdProperty {
7474
Lowercase_Mapping(PropertyType.String, DerivedPropertyStatus.Approved, "lc"),
7575
NFKC_Casefold(PropertyType.String, DerivedPropertyStatus.Approved, "NFKC_CF"),
7676
NFKC_Simple_Casefold(PropertyType.String, DerivedPropertyStatus.Approved, "NFKC_SCF"),
77+
Names_List_Cross_Ref(
78+
PropertyType.String,
79+
DerivedPropertyStatus.UCDNonProperty,
80+
null,
81+
ValueCardinality.Unordered,
82+
"Names_List_Cross_Ref"),
7783
Simple_Case_Folding(PropertyType.String, DerivedPropertyStatus.Approved, "scf", "sfc"),
7884
Simple_Lowercase_Mapping(PropertyType.String, DerivedPropertyStatus.Approved, "slc"),
7985
Simple_Titlecase_Mapping(PropertyType.String, DerivedPropertyStatus.Approved, "stc"),
@@ -125,6 +131,27 @@ public enum UcdProperty {
125131
"Name_Alias"),
126132
Named_Sequences(PropertyType.Miscellaneous, DerivedPropertyStatus.UCDNonProperty, "NS"),
127133
Named_Sequences_Prov(PropertyType.Miscellaneous, DerivedPropertyStatus.UCDNonProperty, "NSP"),
134+
Names_List_Alias(
135+
PropertyType.Miscellaneous,
136+
DerivedPropertyStatus.UCDNonProperty,
137+
null,
138+
ValueCardinality.Unordered,
139+
"Names_List_Alias"),
140+
Names_List_Comment(
141+
PropertyType.Miscellaneous,
142+
DerivedPropertyStatus.UCDNonProperty,
143+
null,
144+
ValueCardinality.Unordered,
145+
"Names_List_Comment"),
146+
Names_List_Subheader(
147+
PropertyType.Miscellaneous,
148+
DerivedPropertyStatus.UCDNonProperty,
149+
"Names_List_Subheader",
150+
"subhead"),
151+
Names_List_Subheader_Notice(
152+
PropertyType.Miscellaneous,
153+
DerivedPropertyStatus.UCDNonProperty,
154+
"Names_List_Subheader_Notice"),
128155
Non_Unihan_Numeric_Value(
129156
PropertyType.Miscellaneous,
130157
DerivedPropertyStatus.UCDNonProperty,

unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1789,6 +1789,11 @@ public static Line_Break_Values forName(String name) {
17891789
// Name_Alias
17901790
// Named_Sequences
17911791
// Named_Sequences_Prov
1792+
// Names_List_Alias
1793+
// Names_List_Comment
1794+
// Names_List_Cross_Ref
1795+
// Names_List_Subheader
1796+
// Names_List_Subheader_Notice
17921797
public enum NFC_Quick_Check_Values implements Named {
17931798
Maybe("M"),
17941799
No("N"),

unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ normalization_correction_corrected ; normalization_correction_corrected ; UCDNon
5757

5858
kEH_AltSeq ; kEH_AltSeq ; Provisional
5959

60+
Names_List_Cross_Ref ; Names_List_Cross_Ref ; UCDNonProperty
61+
6062
# ================================================
6163
# Miscellaneous Properties
6264
# ================================================
@@ -202,3 +204,8 @@ kEH_UniK ; kEH_UniK ; Provisional
202204
# Contributory non-property matching exactly field 8 of UnicodeData.txt.
203205
# Mostly useful as a helper to diachronically parse Numeric_Value.
204206
Non_Unihan_Numeric_Value ; Non_Unihan_Numeric_Value ; UCDNonProperty
207+
208+
Names_List_Subheader ; Names_List_Subheader ; subhead ; UCDNonProperty
209+
Names_List_Subheader_Notice ; Names_List_Subheader_Notice ; UCDNonProperty
210+
Names_List_Alias ; Names_List_Alias ; UCDNonProperty
211+
Names_List_Comment ; Names_List_Comment ; UCDNonProperty

unicodetools/src/main/resources/org/unicode/props/IndexPropertyRegex.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,10 @@ normalization_correction_original ; SINGLE_VALUED ; [0-9A-F]{4,5}
210210
normalization_correction_corrected ; SINGLE_VALUED ; [0-9A-F]{4,5}
211211
normalization_correction_version ; SINGLE_VALUED ; [0-9]\.[0-9]\.[0-9]
212212

213+
Names_List_Alias ; MULTI_VALUED ; .*
214+
Names_List_Comment ; MULTI_VALUED ; .*
215+
Names_List_Cross_Ref ; MULTI_VALUED ; .*
216+
213217
# =============================
214218
# Catalog/Enum/Binary Properties
215219
# All not listed are SINGLE_VALUED ; null

unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ FileType ; CJKRadicals ; CJKRadicals
6161
FileType ; NamedSequences ; NamedSequences
6262
FileType ; NamedSequencesProv ; NamedSequences
6363

64+
FileType ; NamesList ; NamesList
6465

6566
# =======================================
6667
# FILES FOR PROPERTIES
@@ -523,3 +524,9 @@ Unikemet ; kEH_NoMirror
523524
Unikemet ; kEH_NoRotate
524525
Unikemet ; kEH_UniK
525526
Unikemet ; kEH_AltSeq
527+
528+
NamesList ; Names_List_Alias
529+
NamesList ; Names_List_Cross_Ref
530+
NamesList ; Names_List_Comment
531+
NamesList ; Names_List_Subheader
532+
NamesList ; Names_List_Subheader_Notice

0 commit comments

Comments
 (0)