Skip to content

Commit 852f285

Browse files
authored
Unikemet grouped property attributes and newlines in TR38/TR57 (#1150)
* Fixed an issue where Unikemet properties weren't being applied correctly to ranges of characters. Improved handling of newlines and breaks in TR38/TR57.
1 parent 6f8c3d2 commit 852f285

File tree

7 files changed

+68
-22
lines changed

7 files changed

+68
-22
lines changed

unicodetools/data/ucdxml/dev/ucd.nounihan.grouped.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37269,7 +37269,7 @@
3726937269
<char cp="1342E" na="EGYPTIAN HIEROGLYPH AA032" kEH_Cat="T-06-015" kEH_Desc="A bow, of an archaic type, with the ends towards the back." kEH_Func="Logogram (Nubia)" kEH_FVal="sty" kEH_UniK="AA032" kEH_JSesh="Aa32" kEH_HG="AA32"/>
3727037270
<char cp="1342F" age="15.0" na="EGYPTIAN HIEROGLYPH V011D" lb="OP" kEH_Cat="V-03-021" kEH_Func="" kEH_UniK="V011D" kEH_IFAO="434,10"/>
3727137271
</group>
37272-
<group age="15.0" JSN="" gc="Cf" ccc="0" dt="none" dm="#" nt="None" nv="NaN" bc="L" bpt="n" bpb="#" Bidi_M="N" bmg="" suc="#" slc="#" stc="#" uc="#" lc="#" tc="#" scf="#" cf="#" jt="T" jg="No_Joining_Group" ea="N" lb="CM" sc="Egyp" scx="Egyp" Dash="N" WSpace="N" QMark="N" Radical="N" Ideo="N" UIdeo="N" IDSB="N" IDST="N" hst="NA" DI="N" ODI="N" Alpha="N" OAlpha="N" Upper="N" OUpper="N" Lower="N" OLower="N" Math="N" OMath="N" Hex="N" AHex="N" NChar="N" VS="N" Bidi_C="N" Join_C="N" Gr_Base="N" Gr_Ext="N" OGr_Ext="N" STerm="N" Ext="N" Term="N" Dia="N" Dep="N" IDS="N" OIDS="N" XIDS="N" IDC="Y" OIDC="N" XIDC="Y" SD="N" LOE="N" Pat_WS="N" Pat_Syn="N" GCB="CN" WB="FO" SB="FO" CE="N" Comp_Ex="N" NFC_QC="Y" NFD_QC="Y" NFKC_QC="Y" NFKD_QC="Y" CI="Y" Cased="N" CWCF="N" CWCM="N" CWKCF="N" CWL="N" CWT="N" CWU="N" NFKC_CF="#" InSC="Other" InPC="NA" PCM="N" vo="U" RI="N" blk="Egyptian_Hieroglyph_Format_Controls" na1="" Emoji="N" EPres="N" EMod="N" EBase="N" EComp="N" ExtPict="N" NFKC_SCF="#" ID_Compat_Math_Start="N" ID_Compat_Math_Continue="N" IDSU="N" InCB="None" MCM="N">
37272+
<group age="15.0" JSN="" gc="Cf" ccc="0" dt="none" dm="#" nt="None" nv="NaN" bc="L" bpt="n" bpb="#" Bidi_M="N" bmg="" suc="#" slc="#" stc="#" uc="#" lc="#" tc="#" scf="#" cf="#" jt="T" jg="No_Joining_Group" ea="N" lb="CM" sc="Egyp" scx="Egyp" Dash="N" WSpace="N" QMark="N" Radical="N" Ideo="N" UIdeo="N" IDSB="N" IDST="N" hst="NA" DI="N" ODI="N" Alpha="N" OAlpha="N" Upper="N" OUpper="N" Lower="N" OLower="N" Math="N" OMath="N" Hex="N" AHex="N" NChar="N" VS="N" Bidi_C="N" Join_C="N" Gr_Base="N" Gr_Ext="N" OGr_Ext="N" STerm="N" Ext="N" Term="N" Dia="N" Dep="N" IDS="N" OIDS="N" XIDS="N" IDC="Y" OIDC="N" XIDC="Y" SD="N" LOE="N" Pat_WS="N" Pat_Syn="N" GCB="CN" WB="FO" SB="FO" CE="N" Comp_Ex="N" NFC_QC="Y" NFD_QC="Y" NFKC_QC="Y" NFKD_QC="Y" CI="Y" Cased="N" CWCF="N" CWCM="N" CWKCF="N" CWL="N" CWT="N" CWU="N" NFKC_CF="#" InSC="Other" InPC="NA" PCM="N" vo="U" RI="N" blk="Egyptian_Hieroglyph_Format_Controls" na1="" Emoji="N" EPres="N" EMod="N" EBase="N" EComp="N" ExtPict="N" NFKC_SCF="#" ID_Compat_Math_Start="N" ID_Compat_Math_Continue="N" IDSU="N" InCB="None" MCM="N" kEH_Core="N" kEH_NoMirror="N" kEH_NoRotate="N" kEH_AltSeq="#">
3727337273
<char cp="13430" age="12.0" na="EGYPTIAN HIEROGLYPH VERTICAL JOINER" lb="GL" IDC="N" XIDC="N"/>
3727437274
<char cp="13431" age="12.0" na="EGYPTIAN HIEROGLYPH HORIZONTAL JOINER" lb="GL" IDC="N" XIDC="N"/>
3727537275
<char cp="13432" age="12.0" na="EGYPTIAN HIEROGLYPH INSERT AT TOP START" lb="GL" IDC="N" XIDC="N"/>
@@ -41306,7 +41306,7 @@
4130641306
<char cp="143F8" kEH_Cat="AA-22-002" kEH_Desc="A crescent moon shape written on top of a vertical line." kEH_Func="Phonemogram" kEH_FVal="smꜣ" kEH_UniK="AA025B"/>
4130741307
<char cp="143F9" kEH_Cat="AA-22-006" kEH_Core="N" kEH_UniK="HJ AA078" kEH_JSesh="Aa78" kEH_HG="AA78"/>
4130841308
<char cp="143FA" kEH_Cat="AA-22-007" kEH_Desc="Two vertical ovals, connected together at the long sides." kEH_Func="Phonemogram" kEH_FVal="ꜣb" kEH_UniK="HJ AA080" kEH_JSesh="Aa80" kEH_HG="AA80"/>
41309-
<reserved first-cp="143FB" last-cp="143FF" age="unassigned" na="" gc="Cn" lb="XX" sc="Zzzz" scx="Zzzz" Alpha="N" Gr_Base="N" IDS="N" XIDS="N" IDC="N" XIDC="N" WB="XX" SB="XX" kEH_Core="" kEH_NoMirror="" kEH_NoRotate="" kEH_AltSeq=""/>
41309+
<reserved first-cp="143FB" last-cp="143FF" age="unassigned" na="" gc="Cn" lb="XX" sc="Zzzz" scx="Zzzz" Alpha="N" Gr_Base="N" IDS="N" XIDS="N" IDC="N" XIDC="N" WB="XX" SB="XX" kEH_Core="N"/>
4131041310
</group>
4131141311
<group age="8.0" JSN="" gc="Lo" ccc="0" dt="none" dm="#" nt="None" nv="NaN" bc="L" bpt="n" bpb="#" Bidi_M="N" bmg="" suc="#" slc="#" stc="#" uc="#" lc="#" tc="#" scf="#" cf="#" jt="U" jg="No_Joining_Group" ea="N" lb="AL" sc="Hluw" scx="Hluw" Dash="N" WSpace="N" QMark="N" Radical="N" Ideo="N" UIdeo="N" IDSB="N" IDST="N" hst="NA" DI="N" ODI="N" Alpha="Y" OAlpha="N" Upper="N" OUpper="N" Lower="N" OLower="N" Math="N" OMath="N" Hex="N" AHex="N" NChar="N" VS="N" Bidi_C="N" Join_C="N" Gr_Base="Y" Gr_Ext="N" OGr_Ext="N" STerm="N" Ext="N" Term="N" Dia="N" Dep="N" IDS="Y" OIDS="N" XIDS="Y" IDC="Y" OIDC="N" XIDC="Y" SD="N" LOE="N" Pat_WS="N" Pat_Syn="N" GCB="XX" WB="LE" SB="LE" CE="N" Comp_Ex="N" NFC_QC="Y" NFD_QC="Y" NFKC_QC="Y" NFKD_QC="Y" CI="N" Cased="N" CWCF="N" CWCM="N" CWKCF="N" CWL="N" CWT="N" CWU="N" NFKC_CF="#" InSC="Other" InPC="NA" PCM="N" vo="U" RI="N" blk="Anatolian_Hieroglyphs" na1="" Emoji="N" EPres="N" EMod="N" EBase="N" EComp="N" ExtPict="N" NFKC_SCF="#" ID_Compat_Math_Start="N" ID_Compat_Math_Continue="N" IDSU="N" InCB="None" MCM="N">
4131241312
<char cp="14400" na="ANATOLIAN HIEROGLYPH A001"/>

unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,6 @@ public boolean isUnifiedIdeograph(int codepoint) {
355355
}
356356

357357
public boolean isUnikemetAttributeRange(int codepoint) {
358-
return !getAttributeValue(UcdProperty.kEH_Cat, codepoint).isEmpty();
358+
return getAttributeValue(UcdProperty.Block, codepoint).startsWith("Egyptian_Hieroglyph");
359359
}
360360
}

unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1830,7 +1830,9 @@ private static HashMap<String, TRDetails> parseTR(String url)
18301830
isList,
18311831
matcher.group(3)
18321832
.trim()
1833-
.replaceAll("<br>", "")
1833+
.replaceAll("\r", "")
1834+
.replaceAll("<br>", "\n")
1835+
.replaceAll("\n\n", "\n")
18341836
.replaceAll("<span class=\"removed\">.*?</span>", "")
18351837
.replaceAll("<span class=\"changed\">", "")
18361838
.replaceAll("</span>", ""));

unicodetools/src/main/java/org/unicode/xml/UCDXML.java

Lines changed: 53 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -403,10 +403,16 @@ private static int buildChars(
403403
ucdVersion,
404404
range,
405405
rangeType,
406-
groupAttrs);
406+
groupAttrs,
407+
outputRange);
407408
} else {
408409
buildUngroupedRange(
409-
writer, attributeResolver, ucdVersion, range, rangeType);
410+
writer,
411+
attributeResolver,
412+
ucdVersion,
413+
range,
414+
rangeType,
415+
outputRange);
410416
}
411417
}
412418
range.clear();
@@ -424,10 +430,16 @@ private static int buildChars(
424430
ucdVersion,
425431
range,
426432
rangeType,
427-
groupAttrs);
433+
groupAttrs,
434+
outputRange);
428435
} else {
429436
buildUngroupedRange(
430-
writer, attributeResolver, ucdVersion, range, rangeType);
437+
writer,
438+
attributeResolver,
439+
ucdVersion,
440+
range,
441+
rangeType,
442+
outputRange);
431443
}
432444
}
433445
range.clear();
@@ -454,9 +466,16 @@ private static int buildChars(
454466
if (outputRange != UCDXMLOUTPUTRANGE.UNIHAN) {
455467
if (outputType == UCDXMLOUTPUTTYPE.GROUPED) {
456468
buildGroupedRange(
457-
writer, attributeResolver, ucdVersion, range, rangeType, groupAttrs);
469+
writer,
470+
attributeResolver,
471+
ucdVersion,
472+
range,
473+
rangeType,
474+
groupAttrs,
475+
outputRange);
458476
} else {
459-
buildUngroupedRange(writer, attributeResolver, ucdVersion, range, rangeType);
477+
buildUngroupedRange(
478+
writer, attributeResolver, ucdVersion, range, rangeType, outputRange);
460479
}
461480
}
462481
}
@@ -544,10 +563,11 @@ private static void buildGroupedRange(
544563
VersionInfo ucdVersion,
545564
ArrayList<Integer> range,
546565
Range rangeType,
547-
AttributesImpl groupAttrs)
566+
AttributesImpl groupAttrs,
567+
UCDXMLOUTPUTRANGE outputRange)
548568
throws SAXException {
549569
AttributesImpl orgRangeAttributes =
550-
getReservedAttributes(ucdVersion, attributeResolver, range);
570+
getReservedAttributes(ucdVersion, attributeResolver, range, outputRange);
551571
AttributesImpl rangeAttributes = new AttributesImpl();
552572
if (range.size() == 1) {
553573
rangeAttributes.addAttribute(
@@ -594,10 +614,11 @@ private static void buildUngroupedRange(
594614
AttributeResolver attributeResolver,
595615
VersionInfo ucdVersion,
596616
ArrayList<Integer> range,
597-
Range rangeType)
617+
Range rangeType,
618+
UCDXMLOUTPUTRANGE outputRange)
598619
throws SAXException {
599620
AttributesImpl rangeAttributes =
600-
getReservedAttributes(ucdVersion, attributeResolver, range);
621+
getReservedAttributes(ucdVersion, attributeResolver, range, outputRange);
601622
writer.startElement(rangeType.tag, rangeAttributes);
602623
{
603624
writer.endElement(rangeType.tag);
@@ -700,6 +721,7 @@ private static AttributesImpl getGroupAttributes(
700721
&& (propDetail.getMaxVersion() == null
701722
|| version.compareTo(propDetail.getMaxVersion()) < 0)) {
702723
int totalCount = 0;
724+
int unassignedCount = 0;
703725
Map<String, Integer> counters = new LinkedHashMap<>();
704726

705727
for (int CodePoint = lowCodePoint; CodePoint <= highCodePoint; CodePoint++) {
@@ -796,7 +818,10 @@ private static boolean getIsAttributeIncluded(
796818
}
797819

798820
private static AttributesImpl getReservedAttributes(
799-
VersionInfo version, AttributeResolver attributeResolver, ArrayList<Integer> range) {
821+
VersionInfo version,
822+
AttributeResolver attributeResolver,
823+
ArrayList<Integer> range,
824+
UCDXMLOUTPUTRANGE outputRange) {
800825
AttributesImpl attributes = new AttributesImpl();
801826

802827
if (range.size() == 1) {
@@ -816,17 +841,30 @@ private static AttributesImpl getReservedAttributes(
816841
"CDATA",
817842
attributeResolver.getHexString(range.get(range.size() - 1)));
818843
}
819-
for (UCDPropertyDetail propDetail : UCDPropertyDetail.baseValues()) {
844+
for (UCDPropertyDetail propDetail : UCDPropertyDetail.ucdxmlValues()) {
820845
UcdProperty prop = propDetail.getUcdProperty();
821846
if (version.compareTo(propDetail.getMinVersion()) >= 0
822847
&& (propDetail.getMaxVersion() == null
823848
|| version.compareTo(propDetail.getMaxVersion()) <= 0)) {
824849
String attrValue =
825850
attributeResolver.getAttributeValue(
826851
propDetail.getUcdProperty(), range.get(0));
827-
828-
attributes.addAttribute(
829-
NAMESPACE, prop.getShortName(), prop.getShortName(), "CDATA", attrValue);
852+
boolean isAttributeIncluded =
853+
getIsAttributeIncluded(
854+
attrValue,
855+
attributeResolver.isUnihanAttributeRange(range.get(0)),
856+
attributeResolver.isUnikemetAttributeRange(range.get(0)),
857+
propDetail,
858+
prop,
859+
outputRange);
860+
if (isAttributeIncluded) {
861+
attributes.addAttribute(
862+
NAMESPACE,
863+
prop.getShortName(),
864+
prop.getShortName(),
865+
"CDATA",
866+
attrValue);
867+
}
830868
}
831869
}
832870
return attributes;

unicodetools/src/main/resources/org/unicode/uax42/fragments/Unikemet.xml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@
1616
attribute kEH_FVal { text }?
1717

1818
code-point-attributes &amp;= attribute kEH_UniK
19-
{ xsd:string { pattern="([A-IK-Z]|AA|NL|NU)\d{3}[A-Z]{0,2}|HJ ([A-IK-Z]|AA)\d{3}[A-Z]{0,2}" } }?
19+
{ xsd:string { pattern="([A-IK-Z]|AA|NL|NU)\d{3}[A-Z]{0,2}" }
20+
| xsd:string { pattern="HJ ([A-IK-Z]|AA)\d{3}[A-Z]{0,2}" }
21+
}?
2022

2123
code-point-attributes &amp;= attribute kEH_JSesh
2224
{ list { ( xsd:string { pattern="([A-IK-Z]|Aa|NL|NU|Ff)\d{1,3}[A-Za-z]{0,5}" }

unicodetools/src/main/resources/org/unicode/uax42/output/tr42.html

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2625,7 +2625,9 @@ <h4>
26252625
attribute kEH_FVal { text }?
26262626

26272627
code-point-attributes &amp;= attribute kEH_UniK
2628-
{ xsd:string { pattern="([A-IK-Z]|AA|NL|NU)\d{3}[A-Z]{0,2}|HJ ([A-IK-Z]|AA)\d{3}[A-Z]{0,2}" } }?
2628+
{ xsd:string { pattern="([A-IK-Z]|AA|NL|NU)\d{3}[A-Z]{0,2}" }
2629+
| xsd:string { pattern="HJ ([A-IK-Z]|AA)\d{3}[A-Z]{0,2}" }
2630+
}?
26292631

26302632
code-point-attributes &amp;= attribute kEH_JSesh
26312633
{ list { ( xsd:string { pattern="([A-IK-Z]|Aa|NL|NU|Ff)\d{1,3}[A-Za-z]{0,5}" }

unicodetools/src/main/resources/org/unicode/uax42/output/tr42.rnc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1379,7 +1379,9 @@
13791379
attribute kEH_FVal { text }?
13801380

13811381
code-point-attributes &= attribute kEH_UniK
1382-
{ xsd:string { pattern="([A-IK-Z]|AA|NL|NU)\d{3}[A-Z]{0,2}|HJ ([A-IK-Z]|AA)\d{3}[A-Z]{0,2}" } }?
1382+
{ xsd:string { pattern="([A-IK-Z]|AA|NL|NU)\d{3}[A-Z]{0,2}" }
1383+
| xsd:string { pattern="HJ ([A-IK-Z]|AA)\d{3}[A-Z]{0,2}" }
1384+
}?
13831385

13841386
code-point-attributes &= attribute kEH_JSesh
13851387
{ list { ( xsd:string { pattern="([A-IK-Z]|Aa|NL|NU|Ff)\d{1,3}[A-Za-z]{0,5}" }

0 commit comments

Comments
 (0)