Skip to content

Commit b0362cf

Browse files
authored
Parse block headers (#1317)
* Parse block headers * nonuniform unassigned * more eggsceptions * NO_SPLIT
1 parent 84b5056 commit b0362cf

File tree

9 files changed

+123
-18
lines changed

9 files changed

+123
-18
lines changed

unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -922,11 +922,11 @@ protected List _getValueAliases(String valueAlias, List result) {
922922
}
923923
return result;
924924
}
925-
// @Override
926-
// public boolean hasUniformUnassigned() {
927-
// //throw new UnsupportedOperationException();
928-
// return false;
929-
// }
925+
926+
@Override
927+
public boolean hasUniformUnassigned() {
928+
return false;
929+
}
930930
}
931931

932932
{

unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java

Lines changed: 42 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -892,6 +892,23 @@ private static void parseNamesListFile(
892892
IndexUnicodeProperties nextProperties,
893893
Set<PropertyParsingInfo> propInfoSet) {
894894
final var namesListChar = Pattern.compile("[0-9A-F]{4,6}");
895+
final var blockHeaderPropInfo =
896+
property2PropertyInfo.get(UcdProperty.Names_List_Block_Header);
897+
final var nextBlockHeader =
898+
nextProperties == null
899+
? null
900+
: nextProperties.getProperty(UcdProperty.Names_List_Block_Header);
901+
final UnicodeMap<String> blockHeaderData =
902+
indexUnicodeProperties.property2UnicodeMap.get(UcdProperty.Names_List_Block_Header);
903+
final var blockHeaderNoticePropInfo =
904+
property2PropertyInfo.get(UcdProperty.Names_List_Block_Header_Notice);
905+
final var nextBlockHeaderNotice =
906+
nextProperties == null
907+
? null
908+
: nextProperties.getProperty(UcdProperty.Names_List_Block_Header_Notice);
909+
final UnicodeMap<String> blockHeaderNoticeData =
910+
indexUnicodeProperties.property2UnicodeMap.get(
911+
UcdProperty.Names_List_Block_Header_Notice);
895912
final var subheaderPropInfo = property2PropertyInfo.get(UcdProperty.Names_List_Subheader);
896913
final var nextSubheader =
897914
nextProperties == null
@@ -933,10 +950,14 @@ private static void parseNamesListFile(
933950

934951
aliasPropInfo.multivaluedSplit = NO_SPLIT;
935952
commentPropInfo.multivaluedSplit = NO_SPLIT;
953+
blockHeaderNoticePropInfo.multivaluedSplit = NO_SPLIT;
936954

955+
String blockHeader = null;
956+
String blockHeaderNotice = null;
937957
String subheader = null;
938958
String subheaderNotice = null;
939959
IntRange codePoint = null;
960+
IntRange blockRange = null;
940961
for (String line : lines) {
941962
String[] parts = line.split("\t+");
942963
if (parts.length == 2 && namesListChar.matcher(parts[0]).matches()) {
@@ -1000,16 +1021,31 @@ private static void parseNamesListFile(
10001021
codePoint = null;
10011022
}
10021023
if (parts.length == 4 && parts[0].equals("@@")) {
1003-
// New block header, clear the current subheader.
1024+
blockRange = new IntRange();
1025+
blockRange.set(parts[1] + ".." + parts[3]);
1026+
blockHeaderPropInfo.put(
1027+
blockHeaderData,
1028+
blockRange,
1029+
parts[2],
1030+
nextBlockHeader,
1031+
indexUnicodeProperties.getUcdVersion());
1032+
blockHeaderNotice = null;
10041033
subheader = null;
10051034
subheaderNotice = null;
10061035
codePoint = null;
10071036
}
1008-
if (parts.length == 2
1009-
&& parts[0].equals("@+")
1010-
&& codePoint == null
1011-
&& subheader != null) {
1012-
subheaderNotice = parts[1];
1037+
if (parts.length == 2 && parts[0].equals("@+") && codePoint == null) {
1038+
if (subheader != null) {
1039+
subheaderNotice = parts[1];
1040+
} else if (blockRange != null) {
1041+
blockHeaderNoticePropInfo.put(
1042+
blockHeaderNoticeData,
1043+
blockRange,
1044+
parts[1],
1045+
IndexUnicodeProperties.MULTIVALUED_JOINER,
1046+
nextBlockHeaderNotice,
1047+
indexUnicodeProperties.getUcdVersion());
1048+
}
10131049
}
10141050
}
10151051
}

unicodetools/src/main/java/org/unicode/props/UcdProperty.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,16 @@ public enum UcdProperty {
184184
null,
185185
ValueCardinality.Unordered,
186186
"Names_List_Alias"),
187+
Names_List_Block_Header(
188+
PropertyType.Miscellaneous,
189+
DerivedPropertyStatus.UCDNonProperty,
190+
"Names_List_Block_Header"),
191+
Names_List_Block_Header_Notice(
192+
PropertyType.Miscellaneous,
193+
DerivedPropertyStatus.UCDNonProperty,
194+
null,
195+
ValueCardinality.Unordered,
196+
"Names_List_Block_Header_Notice"),
187197
Names_List_Comment(
188198
PropertyType.Miscellaneous,
189199
DerivedPropertyStatus.UCDNonProperty,

unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2020,6 +2020,8 @@ public static Math_Class_Ex_Values forName(String name) {
20202020
// Named_Sequences
20212021
// Named_Sequences_Prov
20222022
// Names_List_Alias
2023+
// Names_List_Block_Header
2024+
// Names_List_Block_Header_Notice
20232025
// Names_List_Comment
20242026
// Names_List_Cross_Ref
20252027
// Names_List_Subheader

unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -242,10 +242,12 @@ kEH_Func ; kEH_Func ; Provisional
242242
kEH_FVal ; kEH_FVal ; Provisional
243243
kEH_UniK ; kEH_UniK ; Provisional
244244

245-
Names_List_Subheader ; Names_List_Subheader ; subhead ; UCDNonProperty
246-
Names_List_Subheader_Notice ; Names_List_Subheader_Notice ; UCDNonProperty
247-
Names_List_Alias ; Names_List_Alias ; UCDNonProperty
248-
Names_List_Comment ; Names_List_Comment ; UCDNonProperty
245+
Names_List_Block_Header ; Names_List_Block_Header ; UCDNonProperty
246+
Names_List_Block_Header_Notice ; Names_List_Block_Header_Notice ; UCDNonProperty
247+
Names_List_Subheader ; Names_List_Subheader ; subhead ; UCDNonProperty
248+
Names_List_Subheader_Notice ; Names_List_Subheader_Notice ; UCDNonProperty
249+
Names_List_Alias ; Names_List_Alias ; UCDNonProperty
250+
Names_List_Comment ; Names_List_Comment ; UCDNonProperty
249251

250252
Math_Entity_Name ; Math_Entity_Name ; NonUCDNonProperty
251253
Math_Entity_Set ; Math_Entity_Set ; NonUCDNonProperty

unicodetools/src/main/resources/org/unicode/props/IndexPropertyRegex.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,8 @@ Emoji_SB ; SINGLE_VALUED ; [0-9A-F]{4}
8383
emoji_variation_sequence ; SINGLE_VALUED ; text style|emoji style
8484
Named_Sequences ; SINGLE_VALUED ; $name
8585
Named_Sequences_Prov ; SINGLE_VALUED ; $name
86+
Names_List_Block_Header ; SINGLE_VALUED ; .*
87+
Names_List_Block_Header_Notice ; MULTI_VALUED ; .*
8688
Names_List_Alias ; MULTI_VALUED ; .*
8789
Names_List_Comment ; MULTI_VALUED ; .*
8890
Names_List_Cross_Ref ; MULTI_VALUED ; .*

unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -602,6 +602,8 @@ Unikemet ; kEH_NoRotate
602602
Unikemet ; kEH_UniK
603603
Unikemet ; kEH_AltSeq
604604

605+
NamesList ; Names_List_Block_Header
606+
NamesList ; Names_List_Block_Header_Notice
605607
NamesList ; Names_List_Alias
606608
NamesList ; Names_List_Cross_Ref
607609
NamesList ; Names_List_Comment

unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1548,6 +1548,46 @@ In \P{U6.0:Math_Class_Ex=None}, U6.0:Math_Class = U6.0:Math_Class_Ex
15481548

15491549
OnPairsOf $code_points, EqualityOf Block ⇔ EqualityOf Pretty_Block
15501550

1551+
# Names_List_Block_Header does not partition code points exactly as Block and Pretty_Block do:
1552+
# - for blocks without a names list, the block header gives the assigned range,
1553+
# not the range of the block;
1554+
# - for planes 15 and 16, the block header with the block name only covers the
1555+
# last eight columns of the plane, for the chart that shows the noncharacters;
1556+
# - for planes 1–14, there is a block header Unassigned covering the last eight
1557+
# columns of the plane, again for the noncharacters.
1558+
Let $uncharted_block_ends := [
1559+
[
1560+
\p{Block=Hangul Syllables}
1561+
\p{Block=Tangut Supplement}
1562+
\p{Block=Jurchen}
1563+
\p{Block=/^CJK.Unified.Ideographs/}
1564+
] & \p{gc=Cn}
1565+
]
1566+
Let $noncharacter_charts := [
1567+
\x{1FF80}-\x{1FFFF}
1568+
\x{2FF80}-\x{2FFFF}
1569+
\x{3FF80}-\x{3FFFF}
1570+
\x{4FF80}-\x{4FFFF}
1571+
\x{5FF80}-\x{5FFFF}
1572+
\x{6FF80}-\x{6FFFF}
1573+
\x{7FF80}-\x{7FFFF}
1574+
\x{8FF80}-\x{8FFFF}
1575+
\x{9FF80}-\x{9FFFF}
1576+
\x{AFF80}-\x{AFFFF}
1577+
\x{BFF80}-\x{BFFFF}
1578+
\x{CFF80}-\x{CFFFF}
1579+
\x{DFF80}-\x{DFFFF}
1580+
\x{EFF80}-\x{EFFFF}
1581+
]
1582+
Let $uncharted_pua_planes := [
1583+
\x{F0000}-\x{FFF7F}
1584+
\x{100000}-\x{10FF7F}
1585+
]
1586+
OnPairsOf [$code_points - $noncharacter_charts - $uncharted_block_ends - $uncharted_pua_planes],
1587+
EqualityOf Block ⇔ EqualityOf Names_List_Block_Header
1588+
[$uncharted_block_ends $uncharted_pua_planes] ⊂ \p{Names_List_Block_Header=@none@}
1589+
[$noncharacter_charts] = \p{Names_List_Block_Header=Unassigned}
1590+
15511591
# Basic Propertywise tests.
15521592
Ignoring Name:
15531593

unicodetools/src/test/java/org/unicode/propstest/TestInvariants.java

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -164,13 +164,24 @@ public void TestUniformUnassigned() {
164164
UcdProperty.Names_List_Cross_Ref,
165165
UcdProperty.Names_List_Comment,
166166
UcdProperty.Names_List_Subheader,
167-
UcdProperty.Names_List_Subheader_Notice));
167+
UcdProperty.Names_List_Subheader_Notice,
168+
UcdProperty.Names_List_Block_Header,
169+
UcdProperty.Names_List_Block_Header_Notice));
168170
exceptions.putAll(
169171
General_Category_Values.Private_Use,
170-
Arrays.asList(UcdProperty.Age, UcdProperty.Block, UcdProperty.Pretty_Block));
172+
Arrays.asList(
173+
UcdProperty.Age,
174+
UcdProperty.Block,
175+
UcdProperty.Pretty_Block,
176+
UcdProperty.Names_List_Block_Header,
177+
UcdProperty.Names_List_Block_Header_Notice));
171178
exceptions.putAll(
172179
General_Category_Values.Surrogate,
173-
Arrays.asList(UcdProperty.Block, UcdProperty.Pretty_Block));
180+
Arrays.asList(
181+
UcdProperty.Block,
182+
UcdProperty.Pretty_Block,
183+
UcdProperty.Names_List_Block_Header,
184+
UcdProperty.Names_List_Block_Header_Notice));
174185

175186
List<UcdProperty> ordered = new ArrayList<>();
176187
// ordered.add(UcdProperty.Bidi_Class);

0 commit comments

Comments
 (0)