From f6746bf2c2be23c5d11bfe226411ab0a7644e373 Mon Sep 17 00:00:00 2001 From: John Wilcock Date: Thu, 6 Feb 2025 12:51:31 -0800 Subject: [PATCH 01/10] Merged changes manually from ucdxml --- .gitignore | 1 + docs/ucdxml.md | 22 + .../unicode/props/PropertyParsingInfo.java | 18 +- .../java/org/unicode/props/UcdProperty.java | 6 + .../org/unicode/props/UcdPropertyValues.java | 37 + .../java/org/unicode/props/VersionToAge.java | 2 + .../org/unicode/xml/AttributeResolver.java | 336 ++ .../java/org/unicode/xml/CompareUcdXML.java | 197 + .../unicode/xml/GeneratePropertyValues.java | 1749 +++++++++ .../java/org/unicode/xml/UCDDataResolver.java | 210 + .../java/org/unicode/xml/UCDXMLWriter.java | 74 + .../org/unicode/xml/UcdPropertyDetail.java | 2356 +++++++++++ .../org/unicode/xml/UcdSectionComponent.java | 28 + .../org/unicode/xml/UcdSectionDetail.java | 224 ++ .../src/main/java/org/unicode/xml/UcdXML.java | 825 ++++ .../java/org/unicode/xml/XMLProperties.java | 482 +++ .../unicode/props/ExtraPropertyAliases.txt | 8 +- .../props/ExtraPropertyValueAliases.txt | 6 +- .../org/unicode/props/IndexPropertyRegex.txt | 53 +- .../unicode/props/IndexUnicodeProperties.txt | 23 +- .../org/unicode/uax42/fragments/Bidi_C.xml | 5 + .../org/unicode/uax42/fragments/Bidi_M.xml | 5 + .../org/unicode/uax42/fragments/Emoji.xml | 20 + .../org/unicode/uax42/fragments/InCB.xml | 9 + .../org/unicode/uax42/fragments/InPC.xml | 21 + .../org/unicode/uax42/fragments/InSC.xml | 42 + .../org/unicode/uax42/fragments/JSN.xml | 5 + .../org/unicode/uax42/fragments/Join_C.xml | 5 + .../unicode/uax42/fragments/Name_Alias.xml | 10 + .../org/unicode/uax42/fragments/Nushu.xml | 8 + .../uax42/fragments/Set_of_code_points.xml | 8 + .../org/unicode/uax42/fragments/Tangut.xml | 18 + .../org/unicode/uax42/fragments/Unihan.xml | 347 ++ .../org/unicode/uax42/fragments/age.xml | 23 + .../org/unicode/uax42/fragments/bc.xml | 17 + .../org/unicode/uax42/fragments/blk.xml | 344 ++ .../org/unicode/uax42/fragments/block.xml 
| 10 + .../org/unicode/uax42/fragments/bmg.xml | 5 + .../org/unicode/uax42/fragments/boolean.xml | 4 + .../unicode/uax42/fragments/boundaries.xml | 58 + .../org/unicode/uax42/fragments/bpb.xml | 5 + .../org/unicode/uax42/fragments/bpt.xml | 5 + .../unicode/uax42/fragments/case_folding.xml | 8 + .../unicode/uax42/fragments/case_mapping.xml | 11 + .../unicode/uax42/fragments/case_other.xml | 32 + .../org/unicode/uax42/fragments/casing.xml | 14 + .../org/unicode/uax42/fragments/ccc.xml | 5 + .../unicode/uax42/fragments/cjk-radicals.xml | 10 + .../org/unicode/uax42/fragments/cjkEACC.xml | 5 + .../uax42/fragments/cjkIRG_TSource.xml | 6 + .../unicode/uax42/fragments/composition.xml | 8 + .../org/unicode/uax42/fragments/datatypes.xml | 5 + .../uax42/fragments/datatypes_code_points.xml | 9 + .../unicode/uax42/fragments/decomposition.xml | 11 + .../unicode/uax42/fragments/description.xml | 6 + .../unicode/uax42/fragments/do-not-emit.xml | 22 + .../org/unicode/uax42/fragments/ea.xml | 5 + .../unicode/uax42/fragments/emoji-sources.xml | 11 + .../uax42/fragments/function_graphic.xml | 68 + .../org/unicode/uax42/fragments/gc.xml | 12 + .../org/unicode/uax42/fragments/groups.xml | 8 + .../org/unicode/uax42/fragments/hst.xml | 5 + .../unicode/uax42/fragments/identifier.xml | 26 + .../unicode/uax42/fragments/ideographs.xml | 23 + .../org/unicode/uax42/fragments/isc.xml | 5 + .../uax42/fragments/jis-code-point.xml | 5 + .../org/unicode/uax42/fragments/joining.xml | 53 + .../org/unicode/uax42/fragments/lb.xml | 24 + .../unicode/uax42/fragments/miscellaneous.xml | 11 + .../org/unicode/uax42/fragments/na.xml | 13 + .../org/unicode/uax42/fragments/na1.xml | 5 + .../uax42/fragments/named-sequences.xml | 15 + .../org/unicode/uax42/fragments/namespace.xml | 5 + .../fragments/normalization-corrections.xml | 11 + .../org/unicode/uax42/fragments/numeric.xml | 8 + .../org/unicode/uax42/fragments/pattern.xml | 8 + .../unicode/uax42/fragments/quickcheck.xml | 31 + 
.../unicode/uax42/fragments/repertoire.xml | 6 + .../fragments/repertoire_Code_points.xml | 23 + .../org/unicode/uax42/fragments/script.xml | 49 + .../uax42/fragments/simple_case_mapping.xml | 11 + .../uax42/fragments/standardized-variants.xml | 10 + .../org/unicode/uax42/fragments/start.xml | 6 + .../resources/org/unicode/uax42/index.xml | 1353 +++++++ .../org/unicode/uax42/index2html.xsl | 611 +++ .../resources/org/unicode/uax42/index2rnc.xsl | 45 + .../org/unicode/uax42/output/index.html | 3482 +++++++++++++++++ .../org/unicode/uax42/output/index.rnc | 1455 +++++++ .../main/resources/org/unicode/uax42/pom.xml | 72 + 89 files changed, 15221 insertions(+), 37 deletions(-) create mode 100644 docs/ucdxml.md create mode 100644 unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java create mode 100644 unicodetools/src/main/java/org/unicode/xml/CompareUcdXML.java create mode 100644 unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java create mode 100644 unicodetools/src/main/java/org/unicode/xml/UCDDataResolver.java create mode 100644 unicodetools/src/main/java/org/unicode/xml/UCDXMLWriter.java create mode 100644 unicodetools/src/main/java/org/unicode/xml/UcdPropertyDetail.java create mode 100644 unicodetools/src/main/java/org/unicode/xml/UcdSectionComponent.java create mode 100644 unicodetools/src/main/java/org/unicode/xml/UcdSectionDetail.java create mode 100644 unicodetools/src/main/java/org/unicode/xml/UcdXML.java create mode 100644 unicodetools/src/main/java/org/unicode/xml/XMLProperties.java create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/Bidi_C.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/Bidi_M.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/Emoji.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/InCB.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/InPC.xml create 
mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/InSC.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/JSN.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/Join_C.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/Name_Alias.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/Nushu.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/Set_of_code_points.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/Tangut.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/Unihan.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/age.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/bc.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/blk.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/block.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/bmg.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/boolean.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/boundaries.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/bpb.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/bpt.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/case_folding.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/case_mapping.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/case_other.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/casing.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/ccc.xml create mode 100644 
unicodetools/src/main/resources/org/unicode/uax42/fragments/cjk-radicals.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/cjkEACC.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/cjkIRG_TSource.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/composition.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/datatypes.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/datatypes_code_points.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/decomposition.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/description.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/do-not-emit.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/ea.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/emoji-sources.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/function_graphic.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/gc.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/groups.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/hst.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/identifier.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/ideographs.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/isc.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/jis-code-point.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/joining.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/lb.xml create mode 100644 
unicodetools/src/main/resources/org/unicode/uax42/fragments/miscellaneous.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/na.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/na1.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/named-sequences.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/namespace.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/normalization-corrections.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/numeric.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/pattern.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/quickcheck.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/repertoire.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/repertoire_Code_points.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/script.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/simple_case_mapping.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/standardized-variants.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/start.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/index.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/index2html.xsl create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/index2rnc.xsl create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/output/index.html create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/output/index.rnc create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/pom.xml diff --git a/.gitignore b/.gitignore index 60e7ec63ef..c6d5a34bd2 100644 --- 
a/.gitignore +++ b/.gitignore @@ -43,6 +43,7 @@ perf-*.xml test-*.xml # Directories +.idea/ .settings/ .vs/ .vscode/ diff --git a/docs/ucdxml.md b/docs/ucdxml.md new file mode 100644 index 0000000000..207842db2a --- /dev/null +++ b/docs/ucdxml.md @@ -0,0 +1,22 @@ +# Generating TR42 + +## Step 1 - Generate property value fragments + +- mvn compile exec:java '-Dexec.mainClass="org.unicode.xml.GeneratePropertyValues"' '-Dexec.args="--ucdversion 16.0.0 -f $(cd ./unicodetools/src/main/resources/org/unicode/uax42/fragments; pwd)"' -DCLDR_DIR=$(cd ../cldr ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated ; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) + +## Step 2 - Generate TR42 index.html and index.rnc + +- mvn xml:transform -f $(cd ./unicodetools/src/main/resources/org/unicode/uax42/fragments; pwd) -Doutputdir=../Generated/uax42/ + +## Step 3 - Validate generated UAX XML files + +You'll need a [RELAX NG](https://relaxng.org/) schema validator. We'll use +[jing-trang](https://github.com/relaxng/jing-trang) in this example. + +1. Clone and build [jing-trang](https://github.com/relaxng/jing-trang) +2. Run the following: + ``` + java -jar C:\_git\jing-trang\build\jing.jar -c UNICODETOOLS_REPO_DIR\uax\uax42\output\index.rnc + ``` + Note that the UAX xml file has to be saved as NFD as the Unihan syntax regular expressions are expecting NFD. + diff --git a/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java b/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java index 6c794380e5..9e4ebda092 100644 --- a/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java +++ b/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java @@ -760,10 +760,20 @@ private static void parsePropertyValueFile( assert propInfo.property.getType() == PropertyType.Binary; value = "Yes"; } else { - value = - propInfo.property.getType() == PropertyType.Binary - ? 
"Yes" - : line.getParts()[2]; + if (propInfo.property.getType() == PropertyType.Binary) { + // Handle @missing values for binary attributes (see 13.0.0 emoji-data.txt) + if (line.getParts().length == 3) { + if (line.getParts()[2].equals("No")) { + value = "No"; + } else { + value = "Yes"; + } + } else { + value = "Yes"; + } + } else { + value = line.getParts()[2]; + } // The value should not be an empty string. // Exception: NFKC_Casefold does remove some characters by mapping them to nothing. assert !value.isEmpty() diff --git a/unicodetools/src/main/java/org/unicode/props/UcdProperty.java b/unicodetools/src/main/java/org/unicode/props/UcdProperty.java index fd9e5b7a3f..d2e0c665ac 100644 --- a/unicodetools/src/main/java/org/unicode/props/UcdProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UcdProperty.java @@ -35,6 +35,7 @@ import org.unicode.props.UcdPropertyValues.Sentence_Break_Values; import org.unicode.props.UcdPropertyValues.Vertical_Orientation_Values; import org.unicode.props.UcdPropertyValues.Word_Break_Values; +import org.unicode.props.UcdPropertyValues.kEH_Core_Values; /** * Machine-generated file for properties, produced by GenerateEnums.java from PropertyAliases.txt @@ -84,12 +85,16 @@ public enum UcdProperty { Emoji_SB(PropertyType.Miscellaneous, "ESB"), ISO_Comment(PropertyType.Miscellaneous, "isc"), Jamo_Short_Name(PropertyType.Miscellaneous, "JSN"), + NC_Corrected(PropertyType.Miscellaneous, "ncCorrected"), + NC_Original(PropertyType.Miscellaneous, "ncOriginal"), + NC_Version(PropertyType.Miscellaneous, "ncVersion"), Name(PropertyType.Miscellaneous, "na"), Name_Alias(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "Name_Alias"), Named_Sequences(PropertyType.Miscellaneous, "NS"), Named_Sequences_Prov(PropertyType.Miscellaneous, "NSP"), Standardized_Variant(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "SV"), Unicode_1_Name(PropertyType.Miscellaneous, "na1"), + 
emoji_variation_sequence(PropertyType.Miscellaneous, "EVS"), kAlternateTotalStrokes(PropertyType.Miscellaneous, "cjkAlternateTotalStrokes"), kBigFive(PropertyType.Miscellaneous, "cjkBigFive"), kCCCII(PropertyType.Miscellaneous, "cjkCCCII"), @@ -224,6 +229,7 @@ public enum UcdProperty { kXHC1983(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkXHC1983"), kXerox(PropertyType.Miscellaneous, "cjkXerox"), kZVariant(PropertyType.Miscellaneous, "cjkZVariant"), + kZhuang(PropertyType.Miscellaneous, "cjkZhuang"), kZhuangNumeric(PropertyType.Miscellaneous, "cjkZhuangNumeric"), // Catalog diff --git a/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java b/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java index 37020e727b..7ca3ea80c7 100644 --- a/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java +++ b/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java @@ -68,6 +68,7 @@ public enum Age_Values implements Named { V15_0("15.0"), V15_1("15.1"), V16_0("16.0"), + V17_0("17.0"), Unassigned("NA"); private final PropertyNames names; @@ -756,6 +757,7 @@ public static East_Asian_Width_Values forName(String name) { // Emoji_DCM // Emoji_KDDI // Emoji_SB + // emoji_variation_sequence // Equivalent_Unified_Ideograph // FC_NFKC_Closure public enum General_Category_Values implements Named { @@ -1281,6 +1283,7 @@ public enum Joining_Group_Values implements Named { Heth("Heth"), Kaf("Kaf"), Kaph("Kaph"), + Kashmiri_Yeh("Kashmiri_Yeh"), Khaph("Khaph"), Knotted_Heh("Knotted_Heh"), Lam("Lam"), @@ -1434,6 +1437,36 @@ public static Joining_Type_Values forName(String name) { // kDefinition // kEACC // kEH_Cat + public enum kEH_Core_Values implements Named { + Core("C"), + Legacy("L"), + None("N"); + private final PropertyNames names; + + private kEH_Core_Values(String shortName, String... 
otherNames) { + names = + new PropertyNames( + kEH_Core_Values.class, this, shortName, otherNames); + } + + @Override + public PropertyNames getNames() { + return names; + } + + @Override + public String getShortName() { + return names.getShortName(); + } + + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(kEH_Core_Values.class); + + public static kEH_Core_Values forName(String name) { + return NAME_MATCHER.get(name); + } + } + // kEH_Desc // kEH_Func // kEH_FVal @@ -1537,6 +1570,7 @@ public static Joining_Type_Values forName(String name) { // kVietnameseNumeric // kXerox // kXHC1983 + // kZhuang // kZhuangNumeric // kZVariant public enum Line_Break_Values implements Named { @@ -1619,6 +1653,9 @@ public static Line_Break_Values forName(String name) { // Name_Alias // Named_Sequences // Named_Sequences_Prov + // NC_Corrected + // NC_Original + // NC_Version public enum NFC_Quick_Check_Values implements Named { Maybe("M"), No("N"), diff --git a/unicodetools/src/main/java/org/unicode/props/VersionToAge.java b/unicodetools/src/main/java/org/unicode/props/VersionToAge.java index 86da1e5e0a..5f0ff0d5ce 100644 --- a/unicodetools/src/main/java/org/unicode/props/VersionToAge.java +++ b/unicodetools/src/main/java/org/unicode/props/VersionToAge.java @@ -41,6 +41,7 @@ public enum VersionToAge { ucd( ImmutableMap.builder() + .put(VersionInfo.getInstance(17, 0), getDate(2025, 9)) .put(VersionInfo.getInstance(16, 0), getDate(2024, 9)) .put(VersionInfo.getInstance(15, 1), getDate(2023, 9)) .put(VersionInfo.getInstance(15, 0), getDate(2022, 9)) @@ -73,6 +74,7 @@ public enum VersionToAge { emoji( ImmutableMap.builder() + .put(VersionInfo.getInstance(17, 0), getDate(2025, 9)) .put(VersionInfo.getInstance(16, 0), getDate(2024, 9)) .put(VersionInfo.getInstance(15, 1), getDate(2023, 9)) .put(VersionInfo.getInstance(15, 0), getDate(2022, 9)) diff --git a/unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java 
b/unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java new file mode 100644 index 0000000000..393bb32815 --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java @@ -0,0 +1,336 @@ +package org.unicode.xml; + +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.util.VersionInfo; +import java.util.*; +import org.unicode.cldr.draft.FileUtilities; +import org.unicode.props.*; + +public class AttributeResolver { + + private final IndexUnicodeProperties indexUnicodeProperties; + private final UnicodeMap map_age; + private final UnicodeMap map_block; + private final UnicodeMap map_decomposition_type; + private final UnicodeMap map_general_category; + private final UnicodeMap map_script; + private final UnicodeMap map_script_extensions; + private final HashMap> map_NameAlias; + + // If there is a change in any of these properties between two adjacent characters, it will + // result in a new range. + private final UcdPropertyDetail[] rangeDefiningPropertyDetails = { + UcdPropertyDetail.Age_Detail, + UcdPropertyDetail.Bidi_Class_Detail, + UcdPropertyDetail.Block_Detail, + UcdPropertyDetail.Decomposition_Mapping_Detail, + UcdPropertyDetail.Numeric_Type_Detail, + UcdPropertyDetail.Numeric_Value_Detail, + UcdPropertyDetail.Vertical_Orientation_Detail + }; + + public AttributeResolver(IndexUnicodeProperties iup) { + indexUnicodeProperties = iup; + map_age = indexUnicodeProperties.loadEnum(UcdProperty.Age); + map_block = indexUnicodeProperties.loadEnum(UcdProperty.Block); + map_decomposition_type = indexUnicodeProperties.loadEnum(UcdProperty.Decomposition_Type); + map_general_category = indexUnicodeProperties.loadEnum(UcdProperty.General_Category); + map_script = indexUnicodeProperties.loadEnum(UcdProperty.Script); + map_script_extensions = + indexUnicodeProperties.getProperty(UcdProperty.Script_Extensions).getUnicodeMap(); + + // UCD code is only set up to read a single Alias value from NameAliases.txt + // Instead, we'll load 
the Alias and the Type data as part of the constructor. We'll keep them in + // memory, since + // NameAliases isn't too large. + map_NameAlias = loadNameAliases(); + } + + protected enum AliasType { + ABBREVIATION("abbreviation"), + ALTERNATE("alternate"), + CONTROL("control"), + CORRECTION("correction"), + FIGMENT("figment"), + NONE("none"); + + private final String aliasType; + + AliasType(String aliasType) { + this.aliasType = aliasType; + } + + public String toString() { + return aliasType; + } + } + + private static class NameAlias { + + private String alias; + private final AliasType type; + + private NameAlias(String alias, AliasType type) { + this.alias = alias; + this.type = type; + } + + public String getAlias() { + return alias; + } + + public AliasType getType() { + return type; + } + } + + private static class NameAliasComparator implements java.util.Comparator { + + @Override + public int compare(NameAlias o1, NameAlias o2) { + return o1.getAlias().compareTo(o2.getAlias()); + } + } + + private HashMap> loadNameAliases() { + HashMap> nameAliasesByCodepoint = new HashMap<>(); + final PropertyParsingInfo fileInfo = + PropertyParsingInfo.getPropertyInfo(UcdProperty.Name_Alias); + String fullFilename = fileInfo.getFullFileName(indexUnicodeProperties.getUcdVersion()); + UcdLineParser parser = new UcdLineParser(FileUtilities.in("", fullFilename)); + NameAliasComparator nameAliasComparator = new NameAliasComparator(); + + for (UcdLineParser.UcdLine line : parser) { + String[] parts = line.getParts(); + int codepoint = Integer.parseInt(parts[0], 16); + NameAlias nameAlias; + if (parts.length < 3) { + nameAlias = new NameAlias(parts[1], AliasType.NONE); + } else { + nameAlias = + new NameAlias( + parts[1], AliasType.valueOf(parts[2].toUpperCase(Locale.ROOT))); + } + + if (nameAliasesByCodepoint.containsKey(codepoint)) { + LinkedList nameAliases = + new LinkedList<>(nameAliasesByCodepoint.get(codepoint)); + nameAliases.add(nameAlias); + 
nameAliases.sort(nameAliasComparator); + nameAliasesByCodepoint.replace(codepoint, nameAliases); + } else { + nameAliasesByCodepoint.put(codepoint, new LinkedList<>(List.of(nameAlias))); + } + } + return nameAliasesByCodepoint; + } + + public String getAttributeValue(UcdProperty prop, int codepoint) { + String resolvedValue = indexUnicodeProperties.getResolvedValue(prop, codepoint); + switch (prop.getType()) { + case Numeric: + switch (prop) { + case kOtherNumeric: + case kPrimaryNumeric: + case kAccountingNumeric: + return (resolvedValue.equals("NaN")) ? null : resolvedValue; + default: + return Optional.ofNullable(resolvedValue).orElse("NaN"); + } + case String: + switch (prop) { + case Equivalent_Unified_Ideograph: + String EqUIdeo = getMappingValue(codepoint, resolvedValue, false, ""); + return (EqUIdeo.equals("#")) ? null : EqUIdeo; + case kCompatibilityVariant: + String kCompatibilityVariant = + getMappingValue(codepoint, resolvedValue, false, "U+"); + return (kCompatibilityVariant.equals("#")) ? "" : kCompatibilityVariant; + case kSimplifiedVariant: + case kTraditionalVariant: + String kVariant = + getMappingValue( + codepoint, + resolvedValue, + isUnihanAttributeRange(codepoint), + "U+"); + return (kVariant.equals("#")) ? "" : kVariant; + case Bidi_Mirroring_Glyph: + // Returning empty string for bmg to maintain compatibility with older + // generated files. + String bmg = getMappingValue(codepoint, resolvedValue, false, ""); + return (bmg.equals("#")) ? 
"" : bmg; + default: + return getMappingValue(codepoint, resolvedValue, false, ""); + } + case Miscellaneous: + switch (prop) { + case Jamo_Short_Name: + // return map_jamo_short_name.get(codepoint).getShortName(); + return Optional.ofNullable(resolvedValue).orElse(""); + case Name: + if (resolvedValue != null + && resolvedValue.startsWith("CJK UNIFIED IDEOGRAPH-")) { + return "CJK UNIFIED IDEOGRAPH-#"; + } + if (resolvedValue != null + && resolvedValue.startsWith("CJK COMPATIBILITY IDEOGRAPH-")) { + return "CJK COMPATIBILITY IDEOGRAPH-#"; + } + if (resolvedValue != null + && resolvedValue.startsWith("TANGUT IDEOGRAPH-")) { + return "TANGUT IDEOGRAPH-#"; + } + if (resolvedValue != null + && resolvedValue.startsWith("KHITAN SMALL SCRIPT CHARACTER-")) { + return "KHITAN SMALL SCRIPT CHARACTER-#"; + } + if (resolvedValue != null && resolvedValue.startsWith("NUSHU CHARACTER-")) { + return "NUSHU CHARACTER-#"; + } + if (resolvedValue != null + && resolvedValue.startsWith("EGYPTIAN HIEROGLYPH-")) { + return "EGYPTIAN HIEROGLYPH-#"; + } + return Optional.ofNullable(resolvedValue).orElse(""); + case kDefinition: + return resolvedValue; + default: + if (resolvedValue != null) { + return resolvedValue.replaceAll("\\|", " "); + } + return ""; + } + case Catalog: + switch (prop) { + case Age: + String age = map_age.get(codepoint).getShortName(); + return (age.equals("NA")) ? 
"unassigned" : age; + case Block: + return map_block.get(codepoint).getShortName(); + case Script: + return map_script.get(codepoint).getShortName(); + case Script_Extensions: + StringBuilder extensionBuilder = new StringBuilder(); + String[] extensions = map_script_extensions.get(codepoint).split("\\|", 0); + for (String extension : extensions) { + extensionBuilder.append( + UcdPropertyValues.Script_Values.valueOf(extension) + .getShortName()); + extensionBuilder.append(" "); + } + return extensionBuilder.toString().trim(); + default: + throw new RuntimeException("Missing Catalog case"); + } + case Enumerated: + switch (prop) { + case Decomposition_Type: + // Returning lower case to maintain compatibility with older generated + // files. + return map_decomposition_type + .get(codepoint) + .getShortName() + .toLowerCase(Locale.ROOT); + default: + final UnicodeProperty property = indexUnicodeProperties.getProperty(prop); + final List valueAliases = property.getValueAliases(property.getValue(codepoint)); + return valueAliases.get(0); + } + case Binary: + { + switch (resolvedValue) { + // Seems overkill to get this from UcdPropertyValues.Binary + case "No": + return "N"; + case "Yes": + return "Y"; + default: + throw new RuntimeException("Unexpected Binary value"); + } + } + default: + throw new RuntimeException("Missing PropertyType case"); + } + } + + public boolean isUnassignedCodepoint(int codepoint) { + return UcdPropertyValues.General_Category_Values.Unassigned.equals(getgc(codepoint)) + || UcdPropertyValues.General_Category_Values.Private_Use.equals(getgc(codepoint)) + || UcdPropertyValues.General_Category_Values.Surrogate.equals(getgc(codepoint)); + } + + public UcdPropertyValues.General_Category_Values getgc(int codepoint) { + return map_general_category.get(codepoint); + } + + public String getNChar(int codepoint) { + return getAttributeValue(UcdProperty.Noncharacter_Code_Point, codepoint); + } + + public HashMap getNameAliases(int codepoint) { + HashMap 
nameAliases = new LinkedHashMap<>(); + LinkedList nameAliasList = map_NameAlias.get(codepoint); + if (null != nameAliasList && !nameAliasList.isEmpty()) { + for (NameAlias nameAlias : nameAliasList) { + nameAliases.put(nameAlias.getAlias(), nameAlias.getType().toString()); + } + return nameAliases; + } + return null; + } + + private String getMappingValue( + int codepoint, String resolvedValue, boolean ignoreUnihanRange, String prefix) { + if (null == resolvedValue) { + return "#"; + } + int[] resolvedValueInts = resolvedValue.codePoints().toArray(); + if (resolvedValueInts.length == 1 + && resolvedValueInts[0] == codepoint + && !ignoreUnihanRange) { + return "#"; + } + StringBuilder sb = new StringBuilder(); + for (int i : resolvedValueInts) { + sb.append(prefix).append(getCPString(i)).append(" "); + } + return sb.toString().trim(); + } + + public boolean isDifferentRange(VersionInfo ucdVersion, int codepointA, int codepointB) { + boolean isDifference = false; + for (UcdPropertyDetail propDetail : rangeDefiningPropertyDetails) { + UcdProperty prop = propDetail.getUcdProperty(); + if (ucdVersion.compareTo(propDetail.getMinVersion()) >= 0 + && (propDetail.getMaxVersion() == null + || ucdVersion.compareTo(propDetail.getMaxVersion()) < 0)) { + isDifference = + isDifference + || !getAttributeValue(prop, codepointA) + .equals(getAttributeValue(prop, codepointB)); + } + } + return isDifference; + } + + private static String getCPString(int codepoint) { + return String.format("%4s", Integer.toHexString(codepoint)) + .replace(" ", "0") + .toUpperCase(Locale.ROOT); + } + + public String getHexString(int codepoint) { + return getCPString(codepoint); + } + + public boolean isUnihanAttributeRange(int codepoint) { + return getAttributeValue(UcdProperty.Unified_Ideograph, codepoint).equals("Y") + || !getAttributeValue(UcdProperty.kCompatibilityVariant, codepoint).isEmpty(); + } + + public boolean isUnifiedIdeograph(int codepoint) { + return 
getAttributeValue(UcdProperty.Unified_Ideograph, codepoint).equals("Y") + && getAttributeValue(UcdProperty.Name, codepoint).equals("CJK UNIFIED IDEOGRAPH-#"); + } +} diff --git a/unicodetools/src/main/java/org/unicode/xml/CompareUcdXML.java b/unicodetools/src/main/java/org/unicode/xml/CompareUcdXML.java new file mode 100644 index 0000000000..52d3421e23 --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/CompareUcdXML.java @@ -0,0 +1,197 @@ +package org.unicode.xml; + +import com.ibm.icu.dev.tool.UOption; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.UnicodeSet; +import java.io.*; +import java.util.HashMap; +import java.util.Objects; +import org.unicode.props.UcdProperty; + +public class CompareUcdXML { + + private static final String NEWLINE = System.getProperty("line.separator"); + private static final UOption[] options = { + UOption.HELP_H(), + UOption.create("fileA", 'a', UOption.REQUIRES_ARG), + UOption.create("fileB", 'b', UOption.REQUIRES_ARG) + }; + + private static final UcdProperty[] codepointSequenceProperties = + new UcdProperty[] { + UcdProperty.Named_Sequences, + UcdProperty.Named_Sequences_Prov, + UcdProperty.Standardized_Variant, + UcdProperty.Emoji_DCM, + UcdProperty.Emoji_KDDI, + UcdProperty.Emoji_SB, + UcdProperty.Do_Not_Emit_Preferred + }; + + private static final HashMap knownDifferences; + + static { + knownDifferences = new HashMap<>(); + + // https://github.com/unicode-org/properties/issues/296 + knownDifferences.put(0x31E4, new String[] {"Hani", "Zyyy"}); + knownDifferences.put(0x31E5, new String[] {"Hani", "Zyyy"}); + + // https://github.com/unicode-org/unicodetools/issues/325 + knownDifferences.put(0x109F7, new String[] {"1/6", "2/12"}); + knownDifferences.put(0x109F8, new String[] {"1/4", "3/12"}); + knownDifferences.put(0x109F9, new String[] {"1/3", "4/12"}); + knownDifferences.put(0x109FB, new String[] {"1/2", "6/12"}); + knownDifferences.put(0x109FD, new String[] {"2/3", "8/12"}); + 
knownDifferences.put(0x109FE, new String[] {"3/4", "9/12"}); + knownDifferences.put(0x109FF, new String[] {"5/6", "10/12"}); + + // https://github.com/unicode-org/properties/issues/172 + knownDifferences.put(0x5146, new String[] {"1000000", "1000000 1000000000000"}); + knownDifferences.put(0x79ED, new String[] {"1000000000", "1000000000 1000000000000"}); + } + + private static final int HELP = 0, FILE_A = 1, FILE_B = 2, LOGFILE = 3; + + public static void main(String[] args) throws Exception { + File fileA = null; + File fileB = null; + int errorCount = 0; + + UOption.parseArgs(args, options); + + if (options[HELP].doesOccur) { + System.out.println("CompareUcdXML --fileA {file path} --fileB {file path}"); + System.exit(0); + } + + if (options[FILE_A].doesOccur) { + try { + fileA = new File(options[FILE_A].value); + if (!fileA.exists()) { + throw new IOException(); + } + } catch (Exception e) { + throw new IllegalArgumentException("Could not find " + options[FILE_A].value); + } + } else { + throw new IllegalArgumentException("Missing command line option: --fileA (or -a)"); + } + + if (options[FILE_B].doesOccur) { + try { + fileB = new File(options[FILE_B].value); + if (!fileB.exists()) { + throw new IOException(); + } + } catch (Exception e) { + throw new IllegalArgumentException("Could not find " + options[FILE_B].value); + } + } else { + throw new IllegalArgumentException("Missing command line option: --fileB (or -b)"); + } + + System.out.println("Comparing " + fileA + " and " + fileB); + + final XMLProperties xmlPropsA = new XMLProperties(fileA); + final XMLProperties xmlPropsB = new XMLProperties(fileB); + + // First, iterate through the UcdProperties on each codepoint. 
+ for (final UcdProperty prop : UcdProperty.values()) { + UnicodeMap fileAMap = xmlPropsA.getMap(prop); + UnicodeMap fileBMap = xmlPropsB.getMap(prop); + if (!fileAMap.equals(fileBMap)) { + for (int i = 0; i <= 0x10ffff; ++i) { + try { + String xmlValA = fileAMap.get(i); + String xmlValB = fileBMap.get(i); + if (!Objects.equals(xmlValA, xmlValB)) { + // At least one string is != null and the strings are different, but we + // don't care if one + // is null and one is empty_string + // As far as we care, empty_string == null == "00000" + int lenA = + (xmlValA == null + ? 0 + : (xmlValA.equals("00000") ? 0 : xmlValA.length())); + int lenB = + (xmlValB == null + ? 0 + : (xmlValB.equals("00000") ? 0 : xmlValB.length())); + if (!(lenA == 0 && lenB == 0) + && !isKnownDifference(i, xmlValA, xmlValB)) { + errorCount++; + System.out.println( + "For UCDProperty " + + prop.name() + + " (" + + prop.getShortName() + + ") [" + + String.format("0x%04X", i) + + "], "); + System.out.println("\t" + fileA + " = " + xmlValA); + System.out.println("\t" + fileB + " = " + xmlValB); + } + } + } catch (Exception e) { + System.out.println("Exception thrown for " + String.format("0x%04X", i)); + System.out.println(e.getMessage()); + } + } + } + } + // Now handle anything that contains codepoint sequences. + for (UcdProperty prop : codepointSequenceProperties) { + UnicodeMap fileAMap = xmlPropsA.getMap(prop); + UnicodeMap fileBMap = xmlPropsB.getMap(prop); + UnicodeSet differences = fileAMap.keySet().addAll(fileBMap.keySet()); + for (String key : differences) { + try { + String xmlValA = fileAMap.get(key); + String xmlValB = fileBMap.get(key); + if (!Objects.equals(xmlValA, xmlValB)) { + // At least one string is != null and the strings are different, but we + // don't care if one + // is null and one is empty_string + // As far as we care, empty_string == null == "00000" + int lenA = + (xmlValA == null + ? 0 + : (xmlValA.equals("00000") ? 
0 : xmlValA.length())); + int lenB = + (xmlValB == null + ? 0 + : (xmlValB.equals("00000") ? 0 : xmlValB.length())); + if (!(lenA == 0 && lenB == 0)) { + errorCount++; + System.out.println( + "For UCDProperty " + + prop.name() + + " (" + + prop.getShortName() + + ") [" + + key + + "], "); + System.out.println("\t" + fileA + " = " + xmlValA); + System.out.println("\t" + fileB + " = " + xmlValB); + } + } + } catch (Exception e) { + System.out.println("Exception thrown for " + String.format("0x%04X", key)); + System.out.println(e.getMessage()); + } + } + } + System.exit(errorCount); + } + + private static boolean isKnownDifference(int codepoint, String xmlValA, String xmlValB) { + if (knownDifferences.containsKey(codepoint)) { + String knownValue1 = knownDifferences.get(codepoint)[0]; + String knownValue2 = knownDifferences.get(codepoint)[1]; + return (knownValue1.equals(xmlValA) && knownValue2.equals(xmlValB)) + || (knownValue1.equals(xmlValB) && knownValue2.equals(xmlValA)); + } + return false; + } +} diff --git a/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java b/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java new file mode 100644 index 0000000000..f8a0dfa279 --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java @@ -0,0 +1,1749 @@ +package org.unicode.xml; + +import com.ibm.icu.dev.tool.UOption; +import com.ibm.icu.util.VersionInfo; +import java.io.*; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.unicode.props.PropertyParsingInfo; +import org.unicode.props.UcdProperty; +import org.unicode.props.UcdPropertyValues.*; + +public class GeneratePropertyValues { + + private enum VALUESOUTPUTTYPE { + VALUE_PER_LINE, + ALPHABETICAL_GROUP, + NUMERICAL_GROUP, + MAX_LINE_LENGTH; + } + + private enum SCHEMA { + // Manual indicates a fragment 
file that is maintained manually rather than generated from + // this utility. + // Manual + NAMESPACE("namespace"), + // Manual + DATATYPES("datatypes"), + // Manual + START("start"), + BOOLEAN("boolean"), + // Manual + DESCRIPTION("description"), + // Manual + REPERTOIRE("repertoire"), + PROPERTIES("properties"), + TANGUT("tangut"), + NUSHU("nushu"), + EMOJI_DATA("emoji-data"), + // Manual + BLOCK("block"), + // Manual + NAMED_SEQUENCES("named-sequences"), + // Manual + NORMALIZATION_CORRECTIONS("normalization-corrections"), + // Manual + STANDARDIZED_VARIANTS("standardized-variants"), + // Manual + CJK_RADICALS("cjk-radicals"), + // Manual + EMOJI_SOURCES("emoji-sources"), + DO_NOT_EMIT("do-not-emit"); + + final String name; + + SCHEMA(String name) { + this.name = name; + } + + String getName() { + return this.name; + } + } + + private static final class TR38Details { + boolean isList; + String syntax; + + public TR38Details(boolean isList, String syntax) { + this.isList = isList; + this.syntax = syntax; + } + + public boolean isList() { + return isList; + } + + public String getSyntax() { + return syntax; + } + } + + private static final int MAX_LINE_LENGTH = 70; + private static final String NEWLINE = System.lineSeparator(); + private static final String DOUBLELINE = System.lineSeparator() + System.lineSeparator(); + private static final String TRIPLELINE = + System.lineSeparator() + System.lineSeparator() + System.lineSeparator(); + private static File destinationFolder = null; + + private static HashMap syntaxTR38; + private static final String NAMESPACE = "http://unicode.org/ns/2001/ucdxml"; + private static final String TR38URL = "https://www.unicode.org/reports/tr38"; + private static final UOption[] options = { + UOption.HELP_H(), + UOption.create("ucdversion", 'v', UOption.REQUIRES_ARG), + UOption.create("outputfolder", 'f', UOption.REQUIRES_ARG) + }; + + private static final int HELP = 0, UCDVERSION = 1, OUTPUTFOLDER = 2; + + public static void 
main(String[] args) throws Exception { + + VersionInfo ucdVersion = null; + + UOption.parseArgs(args, options); + + if (options[HELP].doesOccur) { + System.out.println( + "GeneratePropertyValuesList --ucdversion {version number} --outputfolder {destination}"); + System.exit(0); + } + + try { + if (options[UCDVERSION].doesOccur) { + try { + ucdVersion = VersionInfo.getInstance(options[UCDVERSION].value); + } catch (Exception e) { + throw new IllegalArgumentException( + "Could not convert " + + options[UCDVERSION].value + + " to a valid UCD version"); + } + } else { + throw new IllegalArgumentException( + "Missing command line option: --ucdversion (or -v)"); + } + if (options[OUTPUTFOLDER].doesOccur) { + try { + destinationFolder = new File(options[OUTPUTFOLDER].value); + if (!destinationFolder.exists()) { + if (!destinationFolder.mkdir()) { + throw new IOException(); + } + } + } catch (Exception e) { + throw new IllegalArgumentException( + "Could not find or create " + options[OUTPUTFOLDER].value); + } + } else { + throw new IllegalArgumentException( + "Missing command line option: --outputfolder (or -f)"); + } + + } catch (Exception e) { + System.err.println(e.getMessage()); + System.exit(1); + } + + if (ucdVersion != null && destinationFolder.exists()) { + buildPropertyValues(ucdVersion); + System.out.println("End"); + System.exit(0); + } else { + System.err.println("Unexpected error when building UcdXML file."); + System.exit(1); + } + } + + private static void buildPropertyValues( + // It would be nice to be able to generate values by ucdVersion. Leaving this here for + // now... 
+ VersionInfo ucdVersion) throws IOException, URISyntaxException { + syntaxTR38 = parseTR38(); + + createPropertyFragment( + SCHEMA.BOOLEAN, + getFormattedValues(SCHEMA.BOOLEAN, VALUESOUTPUTTYPE.MAX_LINE_LENGTH)); + createPropertyFragment( + UcdProperty.Age, + SCHEMA.PROPERTIES, + getFormattedAttribute(UcdProperty.Age, VALUESOUTPUTTYPE.NUMERICAL_GROUP)); + createPropertyFragment( + UcdProperty.Name, SCHEMA.PROPERTIES, getFormattedSyntax(UcdProperty.Name)); + createPropertyFragment( + UcdProperty.Unicode_1_Name, + SCHEMA.PROPERTIES, + getFormattedSyntax(UcdProperty.Unicode_1_Name)); + createPropertyFragment( + UcdProperty.Name_Alias.getShortName() + ".xml", + "name-alias element", + SCHEMA.PROPERTIES, + getFormattedElement(UcdProperty.Name_Alias)); + createPropertyFragment( + UcdProperty.Block, + SCHEMA.PROPERTIES, + getFormattedAttribute(UcdProperty.Block, VALUESOUTPUTTYPE.VALUE_PER_LINE)); + createPropertyFragment( + UcdProperty.General_Category, + SCHEMA.PROPERTIES, + getFormattedAttribute( + UcdProperty.General_Category, VALUESOUTPUTTYPE.ALPHABETICAL_GROUP)); + createPropertyFragment( + UcdProperty.Canonical_Combining_Class, + SCHEMA.PROPERTIES, + getFormattedAttribute( + UcdProperty.Canonical_Combining_Class, VALUESOUTPUTTYPE.VALUE_PER_LINE)); + createPropertyFragment( + UcdProperty.Bidi_Class, + SCHEMA.PROPERTIES, + getFormattedAttribute(UcdProperty.Bidi_Class, VALUESOUTPUTTYPE.ALPHABETICAL_GROUP)); + createPropertyFragment( + UcdProperty.Bidi_Mirrored, + SCHEMA.PROPERTIES, + getFormattedBoolean(UcdProperty.Bidi_Mirrored)); + createPropertyFragment( + UcdProperty.Bidi_Mirroring_Glyph, + SCHEMA.PROPERTIES, + getFormattedSyntax(UcdProperty.Bidi_Mirroring_Glyph)); + createPropertyFragment( + UcdProperty.Bidi_Control, + SCHEMA.PROPERTIES, + getFormattedBoolean(UcdProperty.Bidi_Control)); + createPropertyFragment( + UcdProperty.Bidi_Paired_Bracket_Type, + SCHEMA.PROPERTIES, + getFormattedAttribute( + UcdProperty.Bidi_Paired_Bracket_Type, 
VALUESOUTPUTTYPE.MAX_LINE_LENGTH)); + createPropertyFragment( + UcdProperty.Bidi_Paired_Bracket, + SCHEMA.PROPERTIES, + getFormattedSyntax(UcdProperty.Bidi_Paired_Bracket)); + createPropertyFragment( + "decomposition.xml", + "decomposition properties", + SCHEMA.PROPERTIES, + getFormattedDecompositionProperties()); + createPropertyFragment( + "composition.xml", + "composition properties", + SCHEMA.PROPERTIES, + getFormattedCompositionProperties()); + createPropertyFragment( + "quickcheck.xml", + "quick check properties", + SCHEMA.PROPERTIES, + getFormattedQuickCheckProperties()); + createPropertyFragment( + "numeric.xml", + "numeric properties", + SCHEMA.PROPERTIES, + getFormattedNumericProperties()); + createPropertyFragment( + "joining.xml", + "joining properties", + SCHEMA.PROPERTIES, + getFormattedJoiningProperties()); + createPropertyFragment( + UcdProperty.Join_Control.getShortName() + ".xml", + "joining properties", + SCHEMA.PROPERTIES, + getFormattedBoolean(UcdProperty.Join_Control)); + createPropertyFragment( + UcdProperty.Line_Break, + SCHEMA.PROPERTIES, + getFormattedAttribute(UcdProperty.Line_Break, VALUESOUTPUTTYPE.ALPHABETICAL_GROUP)); + createPropertyFragment( + UcdProperty.East_Asian_Width, + SCHEMA.PROPERTIES, + getFormattedAttribute( + UcdProperty.East_Asian_Width, VALUESOUTPUTTYPE.MAX_LINE_LENGTH)); + createPropertyFragment( + "casing.xml", + "casing properties", + SCHEMA.PROPERTIES, + getFormattedCasingProperties()); + createPropertyFragment( + "simple_case_mapping.xml", + "casing properties", + SCHEMA.PROPERTIES, + getFormattedSimpleCaseMappingProperties()); + createPropertyFragment( + "case_mapping.xml", + "casing properties", + SCHEMA.PROPERTIES, + getFormattedCaseMappingProperties()); + createPropertyFragment( + "case_folding.xml", + "casing properties", + SCHEMA.PROPERTIES, + getFormattedCaseFoldingProperties()); + createPropertyFragment( + "case_other.xml", + "casing properties", + SCHEMA.PROPERTIES, + getFormattedCaseOtherProperties()); + 
createPropertyFragment( + "script.xml", + "script properties", + SCHEMA.PROPERTIES, + getFormattedScriptProperties()); + createPropertyFragment( + UcdProperty.ISO_Comment, + SCHEMA.PROPERTIES, + getFormattedSyntax(UcdProperty.ISO_Comment)); + createPropertyFragment( + UcdProperty.Hangul_Syllable_Type, + SCHEMA.PROPERTIES, + getFormattedAttribute( + UcdProperty.Hangul_Syllable_Type, VALUESOUTPUTTYPE.MAX_LINE_LENGTH)); + createPropertyFragment( + UcdProperty.Jamo_Short_Name, + SCHEMA.PROPERTIES, + getFormattedSyntax(UcdProperty.Jamo_Short_Name)); + createPropertyFragment( + UcdProperty.Indic_Syllabic_Category, + SCHEMA.PROPERTIES, + getFormattedAttribute( + UcdProperty.Indic_Syllabic_Category, VALUESOUTPUTTYPE.VALUE_PER_LINE)); + createPropertyFragment( + UcdProperty.Indic_Positional_Category, + SCHEMA.PROPERTIES, + getFormattedAttribute( + UcdProperty.Indic_Positional_Category, VALUESOUTPUTTYPE.VALUE_PER_LINE)); + createPropertyFragment( + UcdProperty.Indic_Conjunct_Break, + SCHEMA.PROPERTIES, + getFormattedAttribute( + UcdProperty.Indic_Conjunct_Break, VALUESOUTPUTTYPE.VALUE_PER_LINE)); + createPropertyFragment( + "identifier.xml", + "identifier properties", + SCHEMA.PROPERTIES, + getFormattedIdentifierProperties()); + createPropertyFragment( + "pattern.xml", + "pattern properties", + SCHEMA.PROPERTIES, + getFormattedPatternProperties()); + createPropertyFragment( + "function_graphic.xml", + "properties related to function and graphic characteristics", + SCHEMA.PROPERTIES, + getFormattedFunctionGraphicProperties()); + createPropertyFragment( + "boundaries.xml", + "properties related to boundaries", + SCHEMA.PROPERTIES, + getFormattedBoundaryProperties()); + createPropertyFragment( + "ideographs.xml", + "properties related to ideographs", + SCHEMA.PROPERTIES, + getFormattedIdeographProperties()); + createPropertyFragment( + "miscellaneous.xml", + "miscellaneous properties", + SCHEMA.PROPERTIES, + getFormattedMiscellaneousProperties()); + createPropertyFragment( + 
"Unihan.xml", + "Unihan properties", + SCHEMA.PROPERTIES, + getFormattedUnihanProperties()); + createPropertyFragment( + "Tangut.xml", "Tangut data", SCHEMA.TANGUT, getFormattedTangutProperties()); + createPropertyFragment( + "Nushu.xml", "Nushu data", SCHEMA.NUSHU, getFormattedNushuProperties()); + createPropertyFragment( + "Emoji.xml", "Emoji properties", SCHEMA.EMOJI_DATA, getFormattedEmojiProperties()); + createPropertyFragment( + "do-not-emit.xml", + "do-not-emit", + SCHEMA.DO_NOT_EMIT, + getFormattedDoNotEmit(VALUESOUTPUTTYPE.VALUE_PER_LINE)); + } + + private static void createPropertyFragment(SCHEMA schema, String formattedFragment) + throws IOException { + createPropertyFragment( + schema.getName() + ".xml", schema.getName(), schema, formattedFragment); + } + + private static void createPropertyFragment( + UcdProperty ucdProperty, SCHEMA schema, String formattedFragment) throws IOException { + createPropertyFragment( + ucdProperty.getShortName() + ".xml", + ucdProperty.getShortName() + " attribute", + schema, + formattedFragment); + } + + private static void createPropertyFragment( + String filename, String title, SCHEMA schema, String formattedFragment) + throws IOException { + BufferedWriter writer = getFragmentWriter(filename); + writer.write( + "" + + NEWLINE + + "" + + NEWLINE); + writer.write(formattedFragment); + writer.write(NEWLINE + ""); + writer.flush(); + writer.close(); + } + + private static BufferedWriter getFragmentWriter(String filename) + throws IOException { + File fragmentFolder = + new File(destinationFolder + File.separator); + if (!fragmentFolder.exists()) { + if (!fragmentFolder.mkdir()) { + throw new IOException(); + } + } + File outputFile = new File(fragmentFolder, filename); + FileOutputStream fileOutputStream = new FileOutputStream(outputFile); + OutputStreamWriter outputStreamWriter = + new OutputStreamWriter(fileOutputStream, StandardCharsets.UTF_8); + return new BufferedWriter(outputStreamWriter); + } + + private static 
String getFormattedAttribute( + UcdProperty ucdProperty, VALUESOUTPUTTYPE valuesoutputtype) { + String attributeString = " attribute " + ucdProperty.getShortName() + " "; + List values; + StringBuilder stringBuilder = new StringBuilder(); + + switch (ucdProperty) { + case Age: + values = getAgeValues(); + break; + case Block: + values = getBlockValues(); + break; + case General_Category: + values = getGeneralCategoryValues(); + break; + case Canonical_Combining_Class: + values = getCanonicalCombiningClassValues(); + break; + case Bidi_Class: + values = getBidirectionalValues(); + break; + case Bidi_Paired_Bracket_Type: + values = getBidiPairedBracketTypeValues(); + break; + case Decomposition_Type: + values = getDecompositionTypeValues(); + break; + case NFC_Quick_Check: + values = getNFCQuickCheckValues(); + break; + case NFD_Quick_Check: + values = getNFDQuickCheckValues(); + break; + case NFKC_Quick_Check: + values = getNFKCQuickCheckValues(); + break; + case NFKD_Quick_Check: + values = getNFKDQuickCheckValues(); + break; + case Numeric_Type: + values = getNumericTypeValues(); + break; + case Joining_Type: + values = getJoiningTypeValues(); + break; + case Joining_Group: + values = getJoiningGroupValues(); + break; + case Line_Break: + values = getLineBreakValues(); + break; + case East_Asian_Width: + values = getEastAsianWidthValues(); + break; + case Hangul_Syllable_Type: + values = getHangulSyllableTypeValues(); + break; + case Indic_Syllabic_Category: + values = getIndicSyllabicCategoryValues(); + break; + case Indic_Positional_Category: + values = getIndicPositionalCategoryValues(); + break; + case Indic_Conjunct_Break: + values = getIndicConjunctBreakValues(); + break; + case Vertical_Orientation: + values = getVerticalOrientationValues(); + break; + case Grapheme_Cluster_Break: + values = getGraphemeClusterBreakValues(); + break; + case Word_Break: + values = getWordBreakValues(); + break; + case Sentence_Break: + values = getSentenceBreakValues(); + 
break; + case Do_Not_Emit_Type: + values = getDoNotEmitTypeValues(); + break; + + default: + throw new IllegalStateException( + ucdProperty.getShortName() + + " is not handled by " + + "getFormattedAttribute."); + } + String formattedValues = formatValues(attributeString.length(), values, valuesoutputtype); + stringBuilder + .append(" code-point-attributes &=") + .append(NEWLINE) + .append(attributeString) + .append("{ "); + if (formattedValues.contains(NEWLINE)) { + stringBuilder.append(formattedValues).append(NEWLINE); + stringBuilder.append( + String.format("%" + (attributeString.length() + "}?".length()) + "s", "}?")); + } else { + stringBuilder.append(formattedValues).append(" }?"); + } + return stringBuilder.toString(); + } + + private static String getFormattedSyntax(UcdProperty ucdProperty) { + final PropertyParsingInfo propInfo = PropertyParsingInfo.getPropertyInfo(ucdProperty); + if (propInfo.getRegex() == null) { + throw new NullPointerException( + "Could not find syntax for " + ucdProperty.getShortName()); + } + + String attributeString = + ucdProperty.getShortName().startsWith("cjk") + ? 
" attribute " + ucdProperty.getShortName().substring(2) + " " + : " attribute " + ucdProperty.getShortName() + " "; + String formattedAttributeString; + switch (ucdProperty) { + // { text } + case ISO_Comment: + formattedAttributeString = attributeString + "{ text }?"; + break; + + // { single-code-point } + case Equivalent_Unified_Ideograph: + formattedAttributeString = attributeString + "{ single-code-point }?"; + break; + + // { "" | single-code-point } + case Bidi_Mirroring_Glyph: + formattedAttributeString = attributeString + "{ \"\" | single-code-point }?"; + break; + + // { "#" | single-code-point } + case Bidi_Paired_Bracket: + case Simple_Uppercase_Mapping: + case Simple_Lowercase_Mapping: + case Simple_Titlecase_Mapping: + case Simple_Case_Folding: + formattedAttributeString = attributeString + "{ \"#\" | single-code-point }?"; + break; + + // { "#" | zero-or-more-code-points } + case Decomposition_Mapping: + case NFKC_Casefold: + case NFKC_Simple_Casefold: + formattedAttributeString = + attributeString + "{ \"#\" | zero-or-more-code-points }?"; + break; + + // { "#" | one-or-more-code-points } + case FC_NFKC_Closure: + case Uppercase_Mapping: + case Lowercase_Mapping: + case Titlecase_Mapping: + case Case_Folding: + formattedAttributeString = attributeString + "{ \"#\" | one-or-more-code-points }?"; + break; + + // { "NaN" | RegEx } + case Numeric_Value: + formattedAttributeString = + attributeString + + "{ \"NaN\" | xsd:string { pattern=\"" + + cleanRegex(propInfo.getRegex().toString()) + + "\" } }?"; + break; + + // Special cases + case Name: + formattedAttributeString = + attributeString + + "{ \"\" |" + + NEWLINE + + " \"CJK UNIFIED IDEOGRAPH-#\" |" + + NEWLINE + + " \"CJK COMPATIBILITY IDEOGRAPH-#\" |" + + NEWLINE + + " \"EGYPTIAN HIEROGLYPH-#\" |" + + NEWLINE + + " \"TANGUT IDEOGRAPH-#\" |" + + NEWLINE + + " \"KHITAN SMALL SCRIPT CHARACTER-#\" |" + + NEWLINE + + " \"NUSHU CHARACTER-#\" |" + + NEWLINE + + " xsd:string { pattern=\"" + + 
cleanRegex(propInfo.getRegex().toString()) + + "\" }" + + NEWLINE + + " }?"; + break; + case Unicode_1_Name: + formattedAttributeString = + attributeString + + "{ \"\" | xsd:string { pattern=\"" + + cleanRegex(propInfo.getRegex().toString()) + + "\" } }?"; + break; + case Script: + formattedAttributeString = attributeString + "{ script }?"; + break; + case Script_Extensions: + formattedAttributeString = attributeString + "{ list { script + } }?"; + break; + case kTGT_MergedSrc: + // Ideally, should be obtained from a TR. + String kTGT_MergedSrc = + NEWLINE + + " { xsd:string {pattern=\"L2008-[0-9A-F]{4,5}(-[0-9]{4,5})?\"}" + + NEWLINE + + " | xsd:string {pattern=\"L2006-[0-9]{4}\"}" + + NEWLINE + + " | xsd:string {pattern=\"L1997-[0-9]{4}\"}" + + NEWLINE + + " | xsd:string {pattern=\"L1986-[0-9]{4}\"}" + + NEWLINE + + " | xsd:string {pattern=\"S1968-[0-9]{4}\"}" + + NEWLINE + + " | xsd:string {pattern=\"N1966-[0-9]{3}(-[0-9A-Z]{3,4})?\"}" + + NEWLINE + + " | xsd:string {pattern=\"H2004-[A-Z]-[0-9]{4}\"}" + + NEWLINE + + " | xsd:string {pattern=\"L2012-[0-9]{4}\"}" + + NEWLINE + + " | xsd:string {pattern=\"UTN42-[0-9]{3}\"}" + + NEWLINE + + " }?"; + formattedAttributeString = attributeString + kTGT_MergedSrc; + break; + case kReading: + // Ideally, should be obtained from a TR. + String kReading = "{ xsd:string }?"; + formattedAttributeString = attributeString + kReading; + break; + + default: + formattedAttributeString = + attributeString + + "{ xsd:string { pattern=\"" + + cleanRegex(propInfo.getRegex().toString()) + + "\" } }?"; + } + return " code-point-attributes &=" + NEWLINE + formattedAttributeString; + } + + private static String getFormattedTR38Syntax(UcdProperty ucdProperty) { + // TODO: We should determine whether we still want to show empty values in the XML files. 
+ // TODO: See org.unicode.xml.UcdPropertyDetail.isCJKShowIfEmpty() + boolean isShowIfEmpty = false; + for (UcdPropertyDetail propDetail : UcdPropertyDetail.cjkValues()) { + if (propDetail.getUcdProperty().equals(ucdProperty)) { + isShowIfEmpty = propDetail.isCJKShowIfEmpty(); + } + } + + String attributeString = " attribute " + ucdProperty.getShortName().substring(2); + TR38Details tr38Details = syntaxTR38.get(ucdProperty.name()); + if (tr38Details == null) { + throw new NullPointerException( + "Could not locate details for " + ucdProperty.name() + " in " + TR38URL); + } + String formattedSyntax = formatTR38Syntax(tr38Details, isShowIfEmpty); + + return " code-point-attributes &=" + attributeString + NEWLINE + formattedSyntax; + } + + private static String getFormattedElement(UcdProperty ucdProperty) { + // Currently scoped to UcdProperty.Name_Alias, but might need to handle different + // properties. + String nameAliasElement = "name-alias"; + List values = getNameAliasTypeValues(); + PropertyParsingInfo propInfo = PropertyParsingInfo.getPropertyInfo(ucdProperty); + + String elementString = " element " + nameAliasElement + " {" + NEWLINE; + String attributeAliasString = + " attribute alias { xsd:string { pattern=\"" + + cleanRegex(propInfo.getRegex().toString()) + + "\" } }?," + + NEWLINE; + String attributeTypeString = " attribute type "; + + String formattedValues = + formatValues( + attributeTypeString.length(), values, VALUESOUTPUTTYPE.ALPHABETICAL_GROUP); + + return " code-point-attributes &=" + + NEWLINE + + elementString + + attributeAliasString + + attributeTypeString + + "{ " + + formattedValues + + NEWLINE + + String.format( + "%" + (attributeTypeString.length() + "}? } *".length()) + "s", "}? 
} *"); + } + + private static String getFormattedBoolean(UcdProperty ucdProperty) { + String attributeString = " attribute " + ucdProperty.getShortName() + " "; + + return " code-point-attributes &=" + NEWLINE + attributeString + "{ boolean }?"; + } + + private static String getFormattedValues(SCHEMA schema, VALUESOUTPUTTYPE valuesoutputtype) { + List values = getBinaryValues(); + String formattedValues = formatValues(2, values, valuesoutputtype); + return " " + schema.getName() + " = " + formattedValues; + } + + private static String getFormattedPropertyValues( + UcdProperty ucdProperty, VALUESOUTPUTTYPE valuesoutputtype) { + List values = getScriptValues(); + String formattedValues = formatValues(11, values, valuesoutputtype); + return " " + ucdProperty.name().toLowerCase() + " = " + formattedValues; + } + + private static String getFormattedDoNotEmit(VALUESOUTPUTTYPE valuesoutputtype) { + List values = getDoNotEmitTypeValues(); + String formattedValues = formatValues(26, values, valuesoutputtype); + return " ucd.content &=\n" + + " element do-not-emit {\n" + + " element instead {\n" + + " attribute of { one-or-more-code-points },\n" + + " attribute use { one-or-more-code-points },\n" + + " attribute because { " + + formattedValues + + NEWLINE + + " } }+ }?"; + } + + private static String formatTR38Syntax(TR38Details tr38Details, boolean isShowIfEmpty) { + // TODO: We should determine whether we still want to show empty values in the XML files. + // TODO: See org.unicode.xml.UcdPropertyDetail.isCJKShowIfEmpty() + boolean isList = tr38Details.isList(); + String syntax = cleanRegex(tr38Details.getSyntax()); + // This is a kludge as it depends on only having single OR double quotes in the syntax. If + // we have both, we'll + // need to do more investigation on what RELAXNG Compact supports. + String QUOTMARK = syntax.contains("\"") ? 
"'" : "\""; + + boolean hasNewlines = syntax.contains("\n"); + if (hasNewlines) { + int indent; + String firstLinePrefix; + String ending = isList ? " )+}}?" : " }?"; + if (isShowIfEmpty) { + indent = (isList ? 15 : 8); + firstLinePrefix = isList ? " { \"\" | list { " : " { \"\" | "; + } else { + indent = (isList ? 12 : 4); + firstLinePrefix = isList ? " { list { ( " : " { "; + } + String padding = String.format("%" + indent + "s", ""); + StringBuilder formattedSyntaxBuilder = new StringBuilder(); + Pattern syntaxPattern = Pattern.compile("([^\r\n]+)"); + Matcher matcher = syntaxPattern.matcher(syntax); + while (matcher.find()) { + if (formattedSyntaxBuilder.length() == 0) { + // First line + formattedSyntaxBuilder + .append(firstLinePrefix) + .append("xsd:string { pattern=") + .append(QUOTMARK) + .append(matcher.group(1)) + .append(QUOTMARK) + .append(" }") + .append(NEWLINE); + } else { + // Everything else + formattedSyntaxBuilder + .append(padding) + .append( + matcher.group(1) + .replaceAll( + "^[| ]*", + " | xsd:string { pattern=" + QUOTMARK)) + .append(QUOTMARK) + .append(" }") + .append(NEWLINE); + } + } + formattedSyntaxBuilder.append(ending); + return formattedSyntaxBuilder.toString(); + + } else { + if (isShowIfEmpty) { + if (isList) { + return " { \"\" | list { xsd:string { pattern=" + + QUOTMARK + + syntax + + QUOTMARK + + " }+ } }?"; + } else { + return " { \"\" | xsd:string { pattern=" + + QUOTMARK + + syntax + + QUOTMARK + + " } }?"; + } + } else { + if (isList) { + return " { list { xsd:string { pattern=" + + QUOTMARK + + syntax + + QUOTMARK + + " }+ } }?"; + } else { + return " { xsd:string { pattern=" + QUOTMARK + syntax + QUOTMARK + " } }?"; + } + } + } + } + + private static String formatValues( + int indent, List values, VALUESOUTPUTTYPE valuesoutputtype) { + StringBuilder valueBlock = new StringBuilder(); + StringBuilder currentLine = new StringBuilder(); + String padding = String.format("%" + indent + "s", ""); + String groupPrefix = ""; + 
for (String value : values) { + StringBuilder formattedValue = new StringBuilder(); + if (valueBlock.length() > 0 || currentLine.length() > 0) { + formattedValue.append("| "); + } + if (value.startsWith("xsd")) { + formattedValue.append(value); + } else { + formattedValue.append("\"").append(value).append("\""); + } + + switch (valuesoutputtype) { + case NUMERICAL_GROUP: + case ALPHABETICAL_GROUP: + String valuePrefix = getValuePrefix(value, valuesoutputtype); + if (groupPrefix.isEmpty()) { + currentLine.append(formattedValue); + groupPrefix = valuePrefix; + } else if (valuePrefix.equals(groupPrefix)) { + int testLength = + valueBlock.length() == 0 + ? padding.length() + currentLine.length() + " ".length() + : currentLine.length() + " ".length(); + if ((testLength + formattedValue.length()) > MAX_LINE_LENGTH) { + valueBlock.append(currentLine).append(NEWLINE); + currentLine.setLength(0); + currentLine.append(padding).append(formattedValue); + } else { + if (currentLine.length() > 0) { + currentLine.append(" "); + } + currentLine.append(formattedValue); + } + } else { + valueBlock.append(currentLine).append(NEWLINE); + currentLine.setLength(0); + currentLine.append(padding).append(formattedValue); + groupPrefix = valuePrefix; + } + break; + + case MAX_LINE_LENGTH: + int testLength = + valueBlock.length() == 0 + ? 
padding.length() + currentLine.length() + " ".length() + : currentLine.length() + " ".length(); + if ((testLength + formattedValue.length()) > MAX_LINE_LENGTH) { + valueBlock.append(currentLine).append(NEWLINE); + currentLine.setLength(0); + currentLine.append(padding).append(formattedValue); + } else { + if (currentLine.length() > 0) { + currentLine.append(" "); + } + currentLine.append(formattedValue); + } + break; + + case VALUE_PER_LINE: + default: + if (valueBlock.length() > 0) { + valueBlock.append(NEWLINE).append(padding).append("| "); + } + if (value.startsWith("xsd")) { + valueBlock.append(value); + } else { + valueBlock.append("\"").append(value).append("\""); + } + } + } + valueBlock.append(currentLine); + return valueBlock.toString(); + } + + private static String getValuePrefix(String value, VALUESOUTPUTTYPE valuesoutputtype) { + if (valuesoutputtype == VALUESOUTPUTTYPE.ALPHABETICAL_GROUP) { + return value.substring(0, 1); + } + if (valuesoutputtype == VALUESOUTPUTTYPE.NUMERICAL_GROUP) { + if (value.contains(".")) { + return value.substring(0, value.indexOf(".")); + } else { + // String value in list of numbers. See Age_Values for an example. 
+ return value; + } + } else { + throw new IllegalArgumentException(); + } + } + + private static String cleanRegex(String regex) { + return regex.replaceAll("\\[-", "[\\\\-").replaceAll("\\\\/", "/").replaceAll("\\\\'", "'"); + } + + // ********************* Combined properties ********************// + + private static String getFormattedDecompositionProperties() { + return getFormattedAttribute( + UcdProperty.Decomposition_Type, VALUESOUTPUTTYPE.MAX_LINE_LENGTH) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Decomposition_Mapping); + } + + private static String getFormattedCompositionProperties() { + return getFormattedBoolean(UcdProperty.Composition_Exclusion) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Full_Composition_Exclusion); + } + + private static String getFormattedQuickCheckProperties() { + return getFormattedAttribute(UcdProperty.NFC_Quick_Check, VALUESOUTPUTTYPE.MAX_LINE_LENGTH) + + DOUBLELINE + + getFormattedAttribute( + UcdProperty.NFD_Quick_Check, VALUESOUTPUTTYPE.MAX_LINE_LENGTH) + + DOUBLELINE + + getFormattedAttribute( + UcdProperty.NFKC_Quick_Check, VALUESOUTPUTTYPE.MAX_LINE_LENGTH) + + DOUBLELINE + + getFormattedAttribute( + UcdProperty.NFKD_Quick_Check, VALUESOUTPUTTYPE.MAX_LINE_LENGTH) + + TRIPLELINE + + getFormattedBoolean(UcdProperty.Expands_On_NFC) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Expands_On_NFD) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Expands_On_NFKC) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Expands_On_NFKD) + + TRIPLELINE + + getFormattedSyntax(UcdProperty.FC_NFKC_Closure); + } + + private static String getFormattedNumericProperties() { + return getFormattedAttribute(UcdProperty.Numeric_Type, VALUESOUTPUTTYPE.MAX_LINE_LENGTH) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Numeric_Value); + } + + private static String getFormattedJoiningProperties() { + return getFormattedAttribute(UcdProperty.Joining_Type, VALUESOUTPUTTYPE.MAX_LINE_LENGTH) + + DOUBLELINE + + getFormattedAttribute( + 
UcdProperty.Joining_Group, VALUESOUTPUTTYPE.ALPHABETICAL_GROUP); + } + + private static String getFormattedCasingProperties() { + return getFormattedBoolean(UcdProperty.Uppercase) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Lowercase) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Other_Uppercase) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Other_Lowercase); + } + + private static String getFormattedSimpleCaseMappingProperties() { + return getFormattedSyntax(UcdProperty.Simple_Uppercase_Mapping) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Simple_Lowercase_Mapping) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Simple_Titlecase_Mapping); + } + + private static String getFormattedCaseMappingProperties() { + return getFormattedSyntax(UcdProperty.Uppercase_Mapping) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Lowercase_Mapping) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Titlecase_Mapping); + } + + private static String getFormattedCaseFoldingProperties() { + return getFormattedSyntax(UcdProperty.Simple_Case_Folding) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Case_Folding); + } + + private static String getFormattedCaseOtherProperties() { + return getFormattedBoolean(UcdProperty.Case_Ignorable) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Cased) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Changes_When_Casefolded) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Changes_When_Casemapped) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Changes_When_Lowercased) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Changes_When_NFKC_Casefolded) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Changes_When_Titlecased) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Changes_When_Uppercased) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.NFKC_Casefold) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.NFKC_Simple_Casefold); + } + + private static String getFormattedScriptProperties() { + return 
getFormattedPropertyValues(UcdProperty.Script, VALUESOUTPUTTYPE.ALPHABETICAL_GROUP) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Script) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Script_Extensions); + } + + private static String getFormattedIdentifierProperties() { + return getFormattedBoolean(UcdProperty.ID_Start) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Other_ID_Start) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.XID_Start) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.ID_Continue) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Other_ID_Continue) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.XID_Continue) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.ID_Compat_Math_Start) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.ID_Compat_Math_Continue); + } + + private static String getFormattedPatternProperties() { + return getFormattedBoolean(UcdProperty.Pattern_Syntax) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Pattern_White_Space); + } + + private static String getFormattedFunctionGraphicProperties() { + return getFormattedBoolean(UcdProperty.Dash) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Hyphen) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Quotation_Mark) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Terminal_Punctuation) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Sentence_Terminal) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Diacritic) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Extender) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Soft_Dotted) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Alphabetic) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Other_Alphabetic) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Math) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Other_Math) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Hex_Digit) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.ASCII_Hex_Digit) + + DOUBLELINE + + 
getFormattedBoolean(UcdProperty.Default_Ignorable_Code_Point) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Other_Default_Ignorable_Code_Point) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Logical_Order_Exception) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Prepended_Concatenation_Mark) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Modifier_Combining_Mark) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.White_Space) + + DOUBLELINE + + getFormattedAttribute( + UcdProperty.Vertical_Orientation, VALUESOUTPUTTYPE.MAX_LINE_LENGTH) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Regional_Indicator); + } + + private static String getFormattedBoundaryProperties() { + return getFormattedBoolean(UcdProperty.Grapheme_Base) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Grapheme_Extend) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Other_Grapheme_Extend) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Grapheme_Link) + + DOUBLELINE + + getFormattedAttribute( + UcdProperty.Grapheme_Cluster_Break, VALUESOUTPUTTYPE.ALPHABETICAL_GROUP) + + DOUBLELINE + + getFormattedAttribute(UcdProperty.Word_Break, VALUESOUTPUTTYPE.ALPHABETICAL_GROUP) + + DOUBLELINE + + getFormattedAttribute( + UcdProperty.Sentence_Break, VALUESOUTPUTTYPE.ALPHABETICAL_GROUP); + } + + private static String getFormattedIdeographProperties() { + return getFormattedBoolean(UcdProperty.Ideographic) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Unified_Ideograph) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Equivalent_Unified_Ideograph) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.IDS_Binary_Operator) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.IDS_Trinary_Operator) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.IDS_Unary_Operator) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Radical); + } + + private static String getFormattedMiscellaneousProperties() { + return getFormattedBoolean(UcdProperty.Deprecated) + + DOUBLELINE + + 
getFormattedBoolean(UcdProperty.Variation_Selector) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Noncharacter_Code_Point); + } + + private static String getFormattedUnihanProperties() { + return getFormattedTR38Syntax(UcdProperty.kAccountingNumeric) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kAlternateTotalStrokes) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kBigFive) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCangjie) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCantonese) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCCCII) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCheungBauer) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCheungBauerIndex) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCihaiT) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCNS1986) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCNS1992) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCompatibilityVariant) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCowles) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kDaeJaweon) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kDefinition) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kEACC) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kFanqie) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kFenn) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kFennIndex) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kFourCornerCode) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kGB0) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kGB1) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kGB3) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kGB5) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kGB7) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kGB8) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kGradeLevel) + + DOUBLELINE + + 
getFormattedTR38Syntax(UcdProperty.kGSR) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kHangul) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kHanYu) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kHanyuPinlu) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kHanyuPinyin) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kHDZRadBreak) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kHKGlyph) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIBMJapan) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIICore) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_GSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_HSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_JSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_KPSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_KSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_MSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_SSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_TSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_UKSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_USource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_VSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRGDaeJaweon) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRGHanyuDaZidian) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRGKangXi) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kJa) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kJapanese) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kJapaneseKun) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kJapaneseOn) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kJinmeiyoKanji) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kJis0) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kJis1) + + DOUBLELINE + + 
getFormattedTR38Syntax(UcdProperty.kJIS0213) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kJoyoKanji) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kKangXi) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kKarlgren) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kKorean) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kKoreanEducationHanja) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kKoreanName) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kLau) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kMainlandTelegraph) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kMandarin) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kMatthews) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kMeyerWempe) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kMojiJoho) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kMorohashi) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kNelson) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kOtherNumeric) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kPhonetic) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kPrimaryNumeric) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kPseudoGB1) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kRSAdobe_Japan1_6) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kRSUnicode) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kSBGY) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kSemanticVariant) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kSimplifiedVariant) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kSMSZD2003Index) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kSMSZD2003Readings) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kSpecializedSemanticVariant) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kSpoofingVariant) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kStrange) + + DOUBLELINE + + 
getFormattedTR38Syntax(UcdProperty.kTaiwanTelegraph) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kTang) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kTGH) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kTGHZ2013) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kTotalStrokes) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kTraditionalVariant) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kUnihanCore2020) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kVietnamese) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kVietnameseNumeric) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kXerox) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kXHC1983) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kZhuang) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kZhuangNumeric) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kZVariant); + } + + private static String getFormattedTangutProperties() { + return getFormattedSyntax(UcdProperty.kRSTUnicode) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.kTGT_MergedSrc); + } + + private static String getFormattedNushuProperties() { + return getFormattedSyntax(UcdProperty.kSrc_NushuDuben) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.kReading); + } + + private static String getFormattedEmojiProperties() { + return getFormattedBoolean(UcdProperty.Emoji) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Emoji_Presentation) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Emoji_Modifier) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Emoji_Modifier_Base) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Emoji_Component) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Extended_Pictographic); + } + + // ********************* Attribute values ********************// + + private static List getBinaryValues() { + List values = new ArrayList<>(); + for (Binary binaryValues : Binary.values()) { + values.add(binaryValues.getShortName()); + } + // 
Binary should display as Y | N. + values.sort(Collections.reverseOrder()); + return values; + } + + private static List getAgeValues() { + List values = new ArrayList<>(); + for (Age_Values ageValues : Age_Values.values()) { + String shortName = ageValues.getShortName(); + if (shortName.equals("NA")) { + values.add("unassigned"); + } else if (shortName.equals("13.1")) { + // https://github.com/unicode-org/unicodetools/issues/100 + } else { + values.add(shortName); + } + } + return values; + } + + private static List getNameAliasTypeValues() { + List values = new ArrayList<>(); + for (AttributeResolver.AliasType aliastypeValues : AttributeResolver.AliasType.values()) { + if (!aliastypeValues.equals(AttributeResolver.AliasType.NONE)) { + values.add(aliastypeValues.toString()); + } + } + return values; + } + + private static List getBlockValues() { + List values = new ArrayList<>(); + for (Block_Values blockValues : Block_Values.values()) { + values.add(blockValues.getShortName()); + } + return values; + } + + private static List getGeneralCategoryValues() { + List values = new ArrayList<>(); + for (General_Category_Values generalCategoryValues : General_Category_Values.values()) { + if (!generalCategoryValues + .getShortName() + .toUpperCase() + .equals(generalCategoryValues.getShortName())) { + // Some of the General_Category_Values (LC, L, M, N, P, S, Z, C) stand for grouping + // of related + // General_Category values. They won't occur on any individual code point, so can be + // ignored. + values.add(generalCategoryValues.getShortName()); + } + } + return values; + } + + private static List getCanonicalCombiningClassValues() { + List values = new ArrayList<>(); + values.add("xsd:integer { minInclusive=\"0\" maxInclusive=\"254\" }"); + // Because the set of values that this property has taken across the various versions of the + // UCD is rather + // large, our schema does not restrict the possible values to those actually used. 
+ // for (Canonical_Combining_Class_Values canonicalCombiningClassValues : + // Canonical_Combining_Class_Values.values()) { + // values.add(canonicalCombiningClassValues.getShortName()); + // } + return values; + } + + private static List getBidirectionalValues() { + List values = new ArrayList<>(); + for (Bidi_Class_Values bidiClassValues : Bidi_Class_Values.values()) { + values.add(bidiClassValues.getShortName()); + } + return values; + } + + private static List getBidiPairedBracketTypeValues() { + List values = new ArrayList<>(); + // Order should be Open/Close/None + values.add(Bidi_Paired_Bracket_Type_Values.Open.getShortName()); + values.add(Bidi_Paired_Bracket_Type_Values.Close.getShortName()); + values.add(Bidi_Paired_Bracket_Type_Values.None.getShortName()); + // Now let's check to see if there is anything else that we didn't expect + for (Bidi_Paired_Bracket_Type_Values bidiPairedBracketTypeValue : + Bidi_Paired_Bracket_Type_Values.values()) { + if (!values.contains(bidiPairedBracketTypeValue.getShortName())) { + throw new IllegalArgumentException(); + } + } + return values; + } + + private static List getDecompositionTypeValues() { + List values = new ArrayList<>(); + for (Decomposition_Type_Values decompositionTypeValues : + Decomposition_Type_Values.values()) { + // We want "none" to be last. 
+ if (decompositionTypeValues != Decomposition_Type_Values.None) { + values.add(decompositionTypeValues.getNames().getOtherNames().get(0)); + } + } + values.add(Decomposition_Type_Values.None.getNames().getOtherNames().get(0)); + return values; + } + + private static List getNFCQuickCheckValues() { + List values = new ArrayList<>(); + // Order should be Yes/No/Maybe + values.add(NFC_Quick_Check_Values.Yes.getShortName()); + values.add(NFC_Quick_Check_Values.No.getShortName()); + values.add(NFC_Quick_Check_Values.Maybe.getShortName()); + // Now let's check to see if there is anything else that we didn't expect + for (NFC_Quick_Check_Values nfcQuickCheckValues : NFC_Quick_Check_Values.values()) { + if (!values.contains(nfcQuickCheckValues.getShortName())) { + throw new IllegalArgumentException(); + } + } + return values; + } + + private static List getNFDQuickCheckValues() { + List values = new ArrayList<>(); + // Order should be Yes/No + values.add(NFD_Quick_Check_Values.Yes.getShortName()); + values.add(NFD_Quick_Check_Values.No.getShortName()); + // Now let's check to see if there is anything else that we didn't expect + for (NFD_Quick_Check_Values nfdQuickCheckValues : NFD_Quick_Check_Values.values()) { + if (!values.contains(nfdQuickCheckValues.getShortName())) { + throw new IllegalArgumentException(); + } + } + return values; + } + + private static List getNFKCQuickCheckValues() { + List values = new ArrayList<>(); + // Order should be Yes/No/Maybe + values.add(NFKC_Quick_Check_Values.Yes.getShortName()); + values.add(NFKC_Quick_Check_Values.No.getShortName()); + values.add(NFKC_Quick_Check_Values.Maybe.getShortName()); + // Now let's check to see if there is anything else that we didn't expect + for (NFKC_Quick_Check_Values nfkcQuickCheckValues : NFKC_Quick_Check_Values.values()) { + if (!values.contains(nfkcQuickCheckValues.getShortName())) { + throw new IllegalArgumentException(); + } + } + return values; + } + + private static List getNFKDQuickCheckValues() 
{ + List values = new ArrayList<>(); + // Order should be Yes/No + values.add(NFKD_Quick_Check_Values.Yes.getShortName()); + values.add(NFKD_Quick_Check_Values.No.getShortName()); + // Now let's check to see if there is anything else that we didn't expect + for (NFKD_Quick_Check_Values nfkdQuickCheckValues : NFKD_Quick_Check_Values.values()) { + if (!values.contains(nfkdQuickCheckValues.getShortName())) { + throw new IllegalArgumentException(); + } + } + return values; + } + + private static List getNumericTypeValues() { + List values = new ArrayList<>(); + // Order should be Decimal/Digit/Numeric/None + values.add(Numeric_Type_Values.Decimal.getShortName()); + values.add(Numeric_Type_Values.Digit.getShortName()); + values.add(Numeric_Type_Values.Numeric.getShortName()); + values.add(Numeric_Type_Values.None.getShortName()); + // Now let's check to see if there is anything else that we didn't expect + for (Numeric_Type_Values numericTypeValues : Numeric_Type_Values.values()) { + if (!values.contains(numericTypeValues.getShortName())) { + throw new IllegalArgumentException(); + } + } + return values; + } + + private static List getJoiningTypeValues() { + List values = new ArrayList<>(); + for (Joining_Type_Values joiningTypeValues : Joining_Type_Values.values()) { + values.add(joiningTypeValues.getShortName()); + } + return values; + } + + private static List getJoiningGroupValues() { + List values = new ArrayList<>(); + for (Joining_Group_Values joiningGroupValues : Joining_Group_Values.values()) { + values.add(joiningGroupValues.getShortName()); + } + return values; + } + + private static List getLineBreakValues() { + List values = new ArrayList<>(); + for (Line_Break_Values lineBreakValues : Line_Break_Values.values()) { + values.add(lineBreakValues.getShortName()); + } + return values; + } + + private static List getEastAsianWidthValues() { + List values = new ArrayList<>(); + for (East_Asian_Width_Values eastAsianWidthValues : East_Asian_Width_Values.values()) 
{ + values.add(eastAsianWidthValues.getShortName()); + } + return values; + } + + private static List getScriptValues() { + List excludedValues = + Arrays.asList( + Script_Values.Han_with_Bopomofo, + Script_Values.Japanese, + Script_Values.Korean, + Script_Values.Math_Symbols, + Script_Values.Emoji_Symbols, + Script_Values.Other_Symbols, + Script_Values.Unwritten); + List values = new ArrayList<>(); + for (Script_Values scriptValue : Script_Values.values()) { + if (!excludedValues.contains(scriptValue)) { + values.add(scriptValue.getShortName()); + } + // Include the following if you want to add other names + // if (!scriptValue.getNames().getOtherNames().isEmpty()) { + // values.add(scriptValue.getNames().getOtherNames().get(0)); + // } + } + Collections.sort(values); + return values; + } + + private static List getHangulSyllableTypeValues() { + List values = new ArrayList<>(); + for (Hangul_Syllable_Type_Values hangulSyllableTypeValues : + Hangul_Syllable_Type_Values.values()) { + values.add(hangulSyllableTypeValues.getShortName()); + } + return values; + } + + private static List getIndicSyllabicCategoryValues() { + List values = new ArrayList<>(); + for (Indic_Syllabic_Category_Values indicSyllabicCategoryValues : + Indic_Syllabic_Category_Values.values()) { + values.add(indicSyllabicCategoryValues.getShortName()); + } + return values; + } + + private static List getIndicPositionalCategoryValues() { + List values = new ArrayList<>(); + for (Indic_Positional_Category_Values indicPositionalCategoryValues : + Indic_Positional_Category_Values.values()) { + values.add(indicPositionalCategoryValues.getShortName()); + } + return values; + } + + private static List getIndicConjunctBreakValues() { + List values = new ArrayList<>(); + for (Indic_Conjunct_Break_Values indicConjunctBreakValues : + Indic_Conjunct_Break_Values.values()) { + values.add(indicConjunctBreakValues.getShortName()); + } + return values; + } + + private static List getVerticalOrientationValues() { + 
List values = new ArrayList<>(); + for (Vertical_Orientation_Values verticalOrientationValues : + Vertical_Orientation_Values.values()) { + values.add(verticalOrientationValues.getShortName()); + } + return values; + } + + private static List getGraphemeClusterBreakValues() { + List values = new ArrayList<>(); + for (Grapheme_Cluster_Break_Values graphemeClusterBreakValues : + Grapheme_Cluster_Break_Values.values()) { + values.add(graphemeClusterBreakValues.getShortName()); + } + return values; + } + + private static List getWordBreakValues() { + List values = new ArrayList<>(); + for (Word_Break_Values wordBreakValues : Word_Break_Values.values()) { + values.add(wordBreakValues.getShortName()); + } + return values; + } + + private static List getSentenceBreakValues() { + List values = new ArrayList<>(); + for (Sentence_Break_Values sentenceBreakValues : Sentence_Break_Values.values()) { + values.add(sentenceBreakValues.getShortName()); + } + return values; + } + + private static List getDoNotEmitTypeValues() { + List values = new ArrayList<>(); + for (Do_Not_Emit_Type_Values doNotEmitTypeValues : Do_Not_Emit_Type_Values.values()) { + values.add(doNotEmitTypeValues.getShortName()); + } + Collections.sort(values); + return values; + } + + // ********************* Utility methods ********************// + + private static HashMap parseTR38() throws IOException, URISyntaxException { + HashMap syntaxTR38 = new HashMap<>(); + URI uri = new URI(TR38URL); + StringBuilder stringBuilder = new StringBuilder(); + try (InputStream is = uri.toURL().openStream()) { + int ptr = 0; + while ((ptr = is.read()) != -1) { + stringBuilder.append((char) ptr); + } + } + Pattern syntaxPattern = + Pattern.compile( + ">Property.*?(.*?).*?>Delimiter.*?>(.*?).*?>Syntax.*?>(.*?)", + Pattern.DOTALL); + Matcher matcher = syntaxPattern.matcher(stringBuilder.toString()); + while (matcher.find()) { + String delimiter = matcher.group(2).trim(); + boolean isList = false; + switch (delimiter) { + case 
"N/A": + break; + case "space": + isList = true; + break; + default: + throw new IllegalArgumentException( + "Only \"space\" or \"N/A\" are supported values for Delimiter." + + " Found: " + + delimiter); + } + TR38Details tr38Details = + new TR38Details(isList, matcher.group(3).trim().replaceAll("
", "")); + syntaxTR38.put(matcher.group(1).trim(), tr38Details); + } + return syntaxTR38; + } +} diff --git a/unicodetools/src/main/java/org/unicode/xml/UCDDataResolver.java b/unicodetools/src/main/java/org/unicode/xml/UCDDataResolver.java new file mode 100644 index 0000000000..a30067bbb6 --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/UCDDataResolver.java @@ -0,0 +1,210 @@ +package org.unicode.xml; + +import com.ibm.icu.util.VersionInfo; +import java.util.*; +import org.unicode.cldr.draft.FileUtilities; +import org.unicode.props.IndexUnicodeProperties; +import org.unicode.props.PropertyParsingInfo; +import org.unicode.props.UcdLineParser; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +public class UCDDataResolver { + + private final IndexUnicodeProperties indexUnicodeProperties; + private final String namespace; + private final UCDXMLWriter writer; + + public UCDDataResolver(IndexUnicodeProperties iup, String namespace, UCDXMLWriter writer) { + indexUnicodeProperties = iup; + this.namespace = namespace; + this.writer = writer; + } + + public void buildSection(UcdSectionDetail.UcdSection ucdSection) throws SAXException { + VersionInfo minVersion = ucdSection.getMinVersion(); + VersionInfo maxVersion = ucdSection.getMaxVersion(); + String tag = ucdSection.toString(); + String childTag = ucdSection.getChildTag(); + boolean parserWithRange = ucdSection.getParserWithRange(); + boolean parserWithMissing = ucdSection.getParserWithMissing(); + UcdSectionComponent[] ucdSectionComponents = + ucdSection.getUcdSectionDetail().getUcdSectionComponents(); + + if (isCompatibleVersion(minVersion, maxVersion)) { + writer.startElement(tag); + { + for (UcdSectionComponent ucdSectionComponent : ucdSectionComponents) { + if (isCompatibleVersion( + ucdSectionComponent.getMinVersion(), + ucdSectionComponent.getMaxVersion())) { + final PropertyParsingInfo fileInfoEVS = + PropertyParsingInfo.getPropertyInfo( + 
ucdSectionComponent.getUcdProperty()); + String fullFilename = + fileInfoEVS.getFullFileName(indexUnicodeProperties.getUcdVersion()); + UcdLineParser parser = + new UcdLineParser(FileUtilities.in("", fullFilename)); + parser.withRange(parserWithRange); + parser.withMissing(parserWithMissing); + switch (ucdSection) { + case BLOCKS: + for (UcdLineParser.UcdLine line : parser) { + if (!line.getOriginalLine().startsWith("#")) { + AttributesImpl attributes = + getBlockAttributes(namespace, line); + writer.startElement(childTag, attributes); + { + writer.endElement(childTag); + } + } + } + break; + case NAMEDSEQUENCES: + HashMap namedSequences = new HashMap<>(); + for (UcdLineParser.UcdLine line : parser) { + String[] parts = line.getParts(); + namedSequences.put(parts[0], parts[1]); + } + List names = new ArrayList<>(namedSequences.keySet()); + Collections.sort(names); + for (String name : names) { + AttributesImpl attributes = + getNamedSequenceAttributes( + namespace, name, namedSequences); + writer.startElement(childTag, attributes); + { + writer.endElement(childTag); + } + } + break; + case PROVISIONALNAMEDSEQUENCES: + HashMap provisionalNamedSequences = new HashMap<>(); + for (UcdLineParser.UcdLine line : parser) { + String[] parts = line.getParts(); + provisionalNamedSequences.put(parts[0], parts[1]); + } + List psNames = + new ArrayList<>(provisionalNamedSequences.keySet()); + Collections.sort(psNames); + for (String name : psNames) { + AttributesImpl attributes = + getNamedSequenceAttributes( + namespace, name, provisionalNamedSequences); + writer.startElement(childTag, attributes); + { + writer.endElement(childTag); + } + } + break; + default: + for (UcdLineParser.UcdLine line : parser) { + AttributesImpl attributes = + getAttributes(ucdSection, namespace, line); + writer.startElement(childTag, attributes); + { + writer.endElement(childTag); + } + } + } + } + } + writer.endElement(tag); + } + } + } + + private AttributesImpl getAttributes( + 
UcdSectionDetail.UcdSection ucdSection, String namespace, UcdLineParser.UcdLine line) { + switch (ucdSection) { + case CJKRADICALS: + return getCJKRadicalAttributes(namespace, line); + case DONOTEMIT: + return getDoNotEmitAttributes(namespace, line); + case EMOJISOURCES: + return getEmojiSourceAttributes(namespace, line); + case NORMALIZATIONCORRECTIONS: + return getNCAttributes(namespace, line); + case STANDARDIZEDVARIANTS: + return getSVAttributes(namespace, line); + default: + throw new IllegalArgumentException( + "getAttributes failed on an unexpected UcdSection"); + } + } + + private static AttributesImpl getBlockAttributes(String namespace, UcdLineParser.UcdLine line) { + String[] parts = line.getParts(); + String[] range = parts[0].split("\\.\\."); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute(namespace, "first-cp", "first-cp", "CDATA", range[0]); + attributes.addAttribute(namespace, "last-cp", "last-cp", "CDATA", range[1]); + attributes.addAttribute(namespace, "name", "name", "CDATA", parts[1]); + return attributes; + } + + private static AttributesImpl getCJKRadicalAttributes( + String namespace, UcdLineParser.UcdLine line) { + String[] parts = line.getParts(); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute(namespace, "number", "number", "CDATA", parts[0]); + attributes.addAttribute(namespace, "radical", "radical", "CDATA", parts[1]); + attributes.addAttribute(namespace, "ideograph", "ideograph", "CDATA", parts[2]); + return attributes; + } + + private static AttributesImpl getDoNotEmitAttributes( + String namespace, UcdLineParser.UcdLine line) { + String[] parts = line.getParts(); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute(namespace, "of", "of", "CDATA", parts[0]); + attributes.addAttribute(namespace, "use", "use", "CDATA", parts[1]); + attributes.addAttribute(namespace, "because", "because", "CDATA", parts[2]); + return attributes; + } + + private static 
AttributesImpl getEmojiSourceAttributes( + String namespace, UcdLineParser.UcdLine line) { + String[] parts = line.getParts(); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute(namespace, "unicode", "unicode", "CDATA", parts[0]); + attributes.addAttribute(namespace, "docomo", "docomo", "CDATA", parts[1]); + attributes.addAttribute(namespace, "kddi", "kddi", "CDATA", parts[2]); + attributes.addAttribute(namespace, "softbank", "softbank", "CDATA", parts[3]); + return attributes; + } + + private static AttributesImpl getNamedSequenceAttributes( + String namespace, String name, HashMap namedSequences) { + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute(namespace, "name", "name", "CDATA", name); + attributes.addAttribute(namespace, "cps", "cps", "CDATA", namedSequences.get(name)); + return attributes; + } + + private static AttributesImpl getNCAttributes(String namespace, UcdLineParser.UcdLine line) { + String[] parts = line.getParts(); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute(namespace, "cp", "cp", "CDATA", parts[0]); + attributes.addAttribute(namespace, "old", "old", "CDATA", parts[1]); + attributes.addAttribute(namespace, "new", "new", "CDATA", parts[2]); + attributes.addAttribute(namespace, "version", "version", "CDATA", parts[3]); + return attributes; + } + + private static AttributesImpl getSVAttributes(String namespace, UcdLineParser.UcdLine line) { + String[] parts = line.getParts(); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute(namespace, "cps", "cps", "CDATA", parts[0]); + attributes.addAttribute(namespace, "desc", "desc", "CDATA", parts[1]); + attributes.addAttribute( + namespace, "when", "when", "CDATA", parts[2] != null ? 
parts[2] : ""); + return attributes; + } + + private boolean isCompatibleVersion(VersionInfo minVersion, VersionInfo maxVersion) { + return (indexUnicodeProperties.getUcdVersion().compareTo(minVersion) >= 0 + && (maxVersion == null + || indexUnicodeProperties.getUcdVersion().compareTo(maxVersion) <= 0)); + } +} diff --git a/unicodetools/src/main/java/org/unicode/xml/UCDXMLWriter.java b/unicodetools/src/main/java/org/unicode/xml/UCDXMLWriter.java new file mode 100644 index 0000000000..ff31e69c61 --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/UCDXMLWriter.java @@ -0,0 +1,74 @@ +package org.unicode.xml; + +import java.io.FileOutputStream; +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerConfigurationException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.sax.SAXTransformerFactory; +import javax.xml.transform.sax.TransformerHandler; +import javax.xml.transform.stream.StreamResult; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +public class UCDXMLWriter { + + public static final String NAMESPACE = "http://www.unicode.org/ns/2003/ucd/1.0"; + + private final TransformerHandler transformerHandler; + + public TransformerHandler getTransformerHandler() { + return transformerHandler; + } + + public UCDXMLWriter(FileOutputStream f) throws TransformerConfigurationException { + TransformerFactory tfactory = TransformerFactory.newInstance(); + SAXTransformerFactory sfactory = (SAXTransformerFactory) tfactory; + transformerHandler = sfactory.newTransformerHandler(); + Transformer transformer = transformerHandler.getTransformer(); + transformer.setOutputProperty(OutputKeys.ENCODING, "utf-8"); + transformer.setOutputProperty(OutputKeys.METHOD, "xml"); + transformer.setOutputProperty(OutputKeys.INDENT, "yes"); + transformer.setOutputProperty(OutputKeys.STANDALONE, "yes"); + 
transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3"); + transformer.setOutputProperty("{http://xml.apache.org/xalan}indent-amount", "3"); + transformerHandler.setResult(new StreamResult(f)); + } + + public void startFile() throws SAXException { + transformerHandler.startDocument(); + char[] c = "\n".toCharArray(); + transformerHandler.characters(c, 0, c.length); + // TODO: JRW change hardcoded 2023 to current year. + c = " \u00A9 2023 Unicode\u00AE, Inc. ".toCharArray(); + transformerHandler.comment(c, 0, c.length); + c = "\n".toCharArray(); + transformerHandler.characters(c, 0, c.length); + c = " For terms of use, see http://www.unicode.org/terms_of_use.html ".toCharArray(); + transformerHandler.comment(c, 0, c.length); + c = "\n\n\n".toCharArray(); + transformerHandler.characters(c, 0, c.length); + } + + public void endFile() throws SAXException { + transformerHandler.endDocument(); + } + + public void startElement(String tagName) throws SAXException { + AttributesImpl attributes = new AttributesImpl(); + startElement(tagName, attributes); + } + + public void startElement(String tagName, AttributesImpl attributes) throws SAXException { + transformerHandler.startElement(NAMESPACE, tagName, tagName, attributes); + } + + public void addContent(String s) throws SAXException { + char[] d = s.toCharArray(); + transformerHandler.characters(d, 0, d.length); + } + + public void endElement(String tagName) throws SAXException { + transformerHandler.endElement(NAMESPACE, tagName, tagName); + } +} diff --git a/unicodetools/src/main/java/org/unicode/xml/UcdPropertyDetail.java b/unicodetools/src/main/java/org/unicode/xml/UcdPropertyDetail.java new file mode 100644 index 0000000000..a97ef5bab9 --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/UcdPropertyDetail.java @@ -0,0 +1,2356 @@ +package org.unicode.xml; + +import com.ibm.icu.util.VersionInfo; +import java.util.LinkedHashSet; +import java.util.Set; +import 
org.unicode.props.UcdProperty; + +public class UcdPropertyDetail { + + private static LinkedHashSet basePropertyDetails = + new LinkedHashSet(); + private static LinkedHashSet cjkPropertyDetails = + new LinkedHashSet(); + private static LinkedHashSet ucdxmlPropertyDetails = + new LinkedHashSet(); + private static LinkedHashSet allPropertyDetails = + new LinkedHashSet(); + + public static UcdPropertyDetail Age_Detail = + new UcdPropertyDetail( + UcdProperty.Age, VersionInfo.getInstance(3, 2, 0), 1, true, false, false, true); + public static UcdPropertyDetail Name_Detail = + new UcdPropertyDetail( + UcdProperty.Name, + VersionInfo.getInstance(1, 1, 0), + 2, + true, + false, + false, + true); + public static UcdPropertyDetail Jamo_Short_Name_Detail = + new UcdPropertyDetail( + UcdProperty.Jamo_Short_Name, + VersionInfo.getInstance(5, 1, 0), + 3, + true, + false, + false, + true); + public static UcdPropertyDetail General_Category_Detail = + new UcdPropertyDetail( + UcdProperty.General_Category, + VersionInfo.getInstance(1, 1, 0), + 4, + true, + false, + false, + true); + public static UcdPropertyDetail Canonical_Combining_Class_Detail = + new UcdPropertyDetail( + UcdProperty.Canonical_Combining_Class, + VersionInfo.getInstance(1, 1, 0), + 5, + true, + false, + false, + true); + public static UcdPropertyDetail Decomposition_Type_Detail = + new UcdPropertyDetail( + UcdProperty.Decomposition_Type, + VersionInfo.getInstance(1, 1, 0), + 6, + true, + false, + false, + true); + public static UcdPropertyDetail Decomposition_Mapping_Detail = + new UcdPropertyDetail( + UcdProperty.Decomposition_Mapping, + VersionInfo.getInstance(1, 1, 0), + 7, + true, + false, + false, + true); + public static UcdPropertyDetail Numeric_Type_Detail = + new UcdPropertyDetail( + UcdProperty.Numeric_Type, + VersionInfo.getInstance(1, 1, 0), + 8, + true, + false, + false, + true); + public static UcdPropertyDetail Numeric_Value_Detail = + new UcdPropertyDetail( + UcdProperty.Numeric_Value, + 
VersionInfo.getInstance(1, 1, 0), + 9, + true, + false, + false, + true); + public static UcdPropertyDetail Bidi_Class_Detail = + new UcdPropertyDetail( + UcdProperty.Bidi_Class, + VersionInfo.getInstance(1, 1, 0), + 10, + true, + false, + false, + true); + public static UcdPropertyDetail Bidi_Paired_Bracket_Type_Detail = + new UcdPropertyDetail( + UcdProperty.Bidi_Paired_Bracket_Type, + VersionInfo.getInstance(6, 3, 0), + 11, + true, + false, + false, + true); + public static UcdPropertyDetail Bidi_Paired_Bracket_Detail = + new UcdPropertyDetail( + UcdProperty.Bidi_Paired_Bracket, + VersionInfo.getInstance(6, 3, 0), + 12, + true, + false, + false, + true); + public static UcdPropertyDetail Bidi_Mirrored_Detail = + new UcdPropertyDetail( + UcdProperty.Bidi_Mirrored, + VersionInfo.getInstance(1, 1, 0), + 13, + true, + false, + false, + true); + public static UcdPropertyDetail Bidi_Mirroring_Glyph_Detail = + new UcdPropertyDetail( + UcdProperty.Bidi_Mirroring_Glyph, + VersionInfo.getInstance(3, 0, 1), + 14, + true, + false, + false, + true); + public static UcdPropertyDetail Simple_Uppercase_Mapping_Detail = + new UcdPropertyDetail( + UcdProperty.Simple_Uppercase_Mapping, + VersionInfo.getInstance(1, 1, 0), + 15, + true, + false, + false, + true); + public static UcdPropertyDetail Simple_Lowercase_Mapping_Detail = + new UcdPropertyDetail( + UcdProperty.Simple_Lowercase_Mapping, + VersionInfo.getInstance(1, 1, 0), + 16, + true, + false, + false, + true); + public static UcdPropertyDetail Simple_Titlecase_Mapping_Detail = + new UcdPropertyDetail( + UcdProperty.Simple_Titlecase_Mapping, + VersionInfo.getInstance(1, 1, 0), + 17, + true, + false, + false, + true); + public static UcdPropertyDetail Uppercase_Mapping_Detail = + new UcdPropertyDetail( + UcdProperty.Uppercase_Mapping, + VersionInfo.getInstance(2, 1, 8), + 18, + true, + false, + false, + true); + public static UcdPropertyDetail Lowercase_Mapping_Detail = + new UcdPropertyDetail( + 
UcdProperty.Lowercase_Mapping, + VersionInfo.getInstance(2, 1, 8), + 19, + true, + false, + false, + true); + public static UcdPropertyDetail Titlecase_Mapping_Detail = + new UcdPropertyDetail( + UcdProperty.Titlecase_Mapping, + VersionInfo.getInstance(2, 1, 8), + 20, + true, + false, + false, + true); + // public static UcdPropertyDetail Special_Case_Condition_Detail = new UcdPropertyDetail + // ( + // UcdProperty.Special_Case_Condition, VersionInfo.getInstance(1,1,0), 21, + // true, false, false, true); + public static UcdPropertyDetail Simple_Case_Folding_Detail = + new UcdPropertyDetail( + UcdProperty.Simple_Case_Folding, + VersionInfo.getInstance(3, 0, 1), + 22, + true, + false, + false, + true); + public static UcdPropertyDetail Case_Folding_Detail = + new UcdPropertyDetail( + UcdProperty.Case_Folding, + VersionInfo.getInstance(3, 0, 1), + 23, + true, + false, + false, + true); + public static UcdPropertyDetail Joining_Type_Detail = + new UcdPropertyDetail( + UcdProperty.Joining_Type, + VersionInfo.getInstance(2, 0, 0), + 24, + true, + false, + false, + true); + public static UcdPropertyDetail Joining_Group_Detail = + new UcdPropertyDetail( + UcdProperty.Joining_Group, + VersionInfo.getInstance(2, 0, 0), + 25, + true, + false, + false, + true); + public static UcdPropertyDetail East_Asian_Width_Detail = + new UcdPropertyDetail( + UcdProperty.East_Asian_Width, + VersionInfo.getInstance(3, 0, 0), + 26, + true, + false, + false, + true); + public static UcdPropertyDetail Line_Break_Detail = + new UcdPropertyDetail( + UcdProperty.Line_Break, + VersionInfo.getInstance(3, 0, 0), + 27, + true, + false, + false, + true); + public static UcdPropertyDetail Script_Detail = + new UcdPropertyDetail( + UcdProperty.Script, + VersionInfo.getInstance(3, 1, 0), + 28, + true, + false, + false, + true); + public static UcdPropertyDetail Script_Extensions_Detail = + new UcdPropertyDetail( + UcdProperty.Script_Extensions, + VersionInfo.getInstance(6, 1, 0), + 29, + true, + false, 
+ false, + true); + public static UcdPropertyDetail Dash_Detail = + new UcdPropertyDetail( + UcdProperty.Dash, + VersionInfo.getInstance(2, 0, 0), + 30, + true, + false, + false, + true); + public static UcdPropertyDetail White_Space_Detail = + new UcdPropertyDetail( + UcdProperty.White_Space, + VersionInfo.getInstance(2, 0, 0), + 31, + true, + false, + false, + true); + public static UcdPropertyDetail Hyphen_Detail = + new UcdPropertyDetail( + UcdProperty.Hyphen, + VersionInfo.getInstance(2, 0, 0), + 32, + true, + false, + false, + true); + public static UcdPropertyDetail Quotation_Mark_Detail = + new UcdPropertyDetail( + UcdProperty.Quotation_Mark, + VersionInfo.getInstance(2, 0, 0), + 33, + true, + false, + false, + true); + public static UcdPropertyDetail Radical_Detail = + new UcdPropertyDetail( + UcdProperty.Radical, + VersionInfo.getInstance(3, 2, 0), + 34, + true, + false, + false, + true); + public static UcdPropertyDetail Ideographic_Detail = + new UcdPropertyDetail( + UcdProperty.Ideographic, + VersionInfo.getInstance(2, 0, 0), + 35, + true, + false, + false, + true); + public static UcdPropertyDetail Unified_Ideograph_Detail = + new UcdPropertyDetail( + UcdProperty.Unified_Ideograph, + VersionInfo.getInstance(3, 2, 0), + 36, + true, + false, + false, + true); + public static UcdPropertyDetail IDS_Binary_Operator_Detail = + new UcdPropertyDetail( + UcdProperty.IDS_Binary_Operator, + VersionInfo.getInstance(3, 2, 0), + 37, + true, + false, + false, + true); + public static UcdPropertyDetail IDS_Trinary_Operator_Detail = + new UcdPropertyDetail( + UcdProperty.IDS_Trinary_Operator, + VersionInfo.getInstance(3, 2, 0), + 38, + true, + false, + false, + true); + public static UcdPropertyDetail Hangul_Syllable_Type_Detail = + new UcdPropertyDetail( + UcdProperty.Hangul_Syllable_Type, + VersionInfo.getInstance(4, 0, 0), + 39, + true, + false, + false, + true); + public static UcdPropertyDetail Default_Ignorable_Code_Point_Detail = + new UcdPropertyDetail( + 
UcdProperty.Default_Ignorable_Code_Point, + VersionInfo.getInstance(3, 2, 0), + 40, + true, + false, + false, + true); + public static UcdPropertyDetail Other_Default_Ignorable_Code_Point_Detail = + new UcdPropertyDetail( + UcdProperty.Other_Default_Ignorable_Code_Point, + VersionInfo.getInstance(3, 2, 0), + 41, + true, + false, + false, + true); + public static UcdPropertyDetail Alphabetic_Detail = + new UcdPropertyDetail( + UcdProperty.Alphabetic, + VersionInfo.getInstance(1, 1, 0), + 42, + true, + false, + false, + true); + public static UcdPropertyDetail Other_Alphabetic_Detail = + new UcdPropertyDetail( + UcdProperty.Other_Alphabetic, + VersionInfo.getInstance(3, 1, 0), + 43, + true, + false, + false, + true); + public static UcdPropertyDetail Uppercase_Detail = + new UcdPropertyDetail( + UcdProperty.Uppercase, + VersionInfo.getInstance(3, 1, 0), + 44, + true, + false, + false, + true); + public static UcdPropertyDetail Other_Uppercase_Detail = + new UcdPropertyDetail( + UcdProperty.Other_Uppercase, + VersionInfo.getInstance(3, 1, 0), + 45, + true, + false, + false, + true); + public static UcdPropertyDetail Lowercase_Detail = + new UcdPropertyDetail( + UcdProperty.Lowercase, + VersionInfo.getInstance(3, 1, 0), + 46, + true, + false, + false, + true); + public static UcdPropertyDetail Other_Lowercase_Detail = + new UcdPropertyDetail( + UcdProperty.Other_Lowercase, + VersionInfo.getInstance(3, 1, 0), + 47, + true, + false, + false, + true); + public static UcdPropertyDetail Math_Detail = + new UcdPropertyDetail( + UcdProperty.Math, + VersionInfo.getInstance(2, 0, 0), + 48, + true, + false, + false, + true); + public static UcdPropertyDetail Other_Math_Detail = + new UcdPropertyDetail( + UcdProperty.Other_Math, + VersionInfo.getInstance(3, 1, 0), + 49, + true, + false, + false, + true); + public static UcdPropertyDetail Hex_Digit_Detail = + new UcdPropertyDetail( + UcdProperty.Hex_Digit, + VersionInfo.getInstance(2, 0, 0), + 50, + true, + false, + false, + 
true); + public static UcdPropertyDetail ASCII_Hex_Digit_Detail = + new UcdPropertyDetail( + UcdProperty.ASCII_Hex_Digit, + VersionInfo.getInstance(3, 1, 1), + 51, + true, + false, + false, + true); + public static UcdPropertyDetail Noncharacter_Code_Point_Detail = + new UcdPropertyDetail( + UcdProperty.Noncharacter_Code_Point, + VersionInfo.getInstance(3, 0, 1), + 52, + true, + false, + false, + true); + public static UcdPropertyDetail Variation_Selector_Detail = + new UcdPropertyDetail( + UcdProperty.Variation_Selector, + VersionInfo.getInstance(4, 0, 1), + 53, + true, + false, + false, + true); + public static UcdPropertyDetail Bidi_Control_Detail = + new UcdPropertyDetail( + UcdProperty.Bidi_Control, + VersionInfo.getInstance(2, 0, 0), + 54, + true, + false, + false, + true); + public static UcdPropertyDetail Join_Control_Detail = + new UcdPropertyDetail( + UcdProperty.Join_Control, + VersionInfo.getInstance(2, 0, 0), + 55, + true, + false, + false, + true); + public static UcdPropertyDetail Grapheme_Base_Detail = + new UcdPropertyDetail( + UcdProperty.Grapheme_Base, + VersionInfo.getInstance(3, 2, 0), + 56, + true, + false, + false, + true); + public static UcdPropertyDetail Grapheme_Extend_Detail = + new UcdPropertyDetail( + UcdProperty.Grapheme_Extend, + VersionInfo.getInstance(3, 2, 0), + 57, + true, + false, + false, + true); + public static UcdPropertyDetail Other_Grapheme_Extend_Detail = + new UcdPropertyDetail( + UcdProperty.Other_Grapheme_Extend, + VersionInfo.getInstance(3, 2, 0), + 58, + true, + false, + false, + true); + public static UcdPropertyDetail Grapheme_Link_Detail = + new UcdPropertyDetail( + UcdProperty.Grapheme_Link, + VersionInfo.getInstance(3, 2, 0), + 59, + true, + false, + false, + true); + public static UcdPropertyDetail Sentence_Terminal_Detail = + new UcdPropertyDetail( + UcdProperty.Sentence_Terminal, + VersionInfo.getInstance(9, 0, 0), + 60, + true, + false, + false, + true); + public static UcdPropertyDetail Extender_Detail = + 
new UcdPropertyDetail( + UcdProperty.Extender, + VersionInfo.getInstance(2, 0, 0), + 61, + true, + false, + false, + true); + public static UcdPropertyDetail Terminal_Punctuation_Detail = + new UcdPropertyDetail( + UcdProperty.Terminal_Punctuation, + VersionInfo.getInstance(2, 0, 0), + 62, + true, + false, + false, + true); + public static UcdPropertyDetail Diacritic_Detail = + new UcdPropertyDetail( + UcdProperty.Diacritic, + VersionInfo.getInstance(2, 0, 0), + 63, + true, + false, + false, + true); + public static UcdPropertyDetail Deprecated_Detail = + new UcdPropertyDetail( + UcdProperty.Deprecated, + VersionInfo.getInstance(3, 2, 0), + 64, + true, + false, + false, + true); + public static UcdPropertyDetail ID_Start_Detail = + new UcdPropertyDetail( + UcdProperty.ID_Start, + VersionInfo.getInstance(3, 1, 0), + 65, + true, + false, + false, + true); + public static UcdPropertyDetail Other_ID_Start_Detail = + new UcdPropertyDetail( + UcdProperty.Other_ID_Start, + VersionInfo.getInstance(4, 0, 0), + 66, + true, + false, + false, + true); + public static UcdPropertyDetail XID_Start_Detail = + new UcdPropertyDetail( + UcdProperty.XID_Start, + VersionInfo.getInstance(3, 1, 0), + 67, + true, + false, + false, + true); + public static UcdPropertyDetail ID_Continue_Detail = + new UcdPropertyDetail( + UcdProperty.ID_Continue, + VersionInfo.getInstance(3, 1, 0), + 68, + true, + false, + false, + true); + public static UcdPropertyDetail Other_ID_Continue_Detail = + new UcdPropertyDetail( + UcdProperty.Other_ID_Continue, + VersionInfo.getInstance(4, 1, 0), + 69, + true, + false, + false, + true); + public static UcdPropertyDetail XID_Continue_Detail = + new UcdPropertyDetail( + UcdProperty.XID_Continue, + VersionInfo.getInstance(3, 1, 0), + 70, + true, + false, + false, + true); + public static UcdPropertyDetail Soft_Dotted_Detail = + new UcdPropertyDetail( + UcdProperty.Soft_Dotted, + VersionInfo.getInstance(3, 2, 0), + 71, + true, + false, + false, + true); + public 
static UcdPropertyDetail Logical_Order_Exception_Detail = + new UcdPropertyDetail( + UcdProperty.Logical_Order_Exception, + VersionInfo.getInstance(3, 2, 0), + 72, + true, + false, + false, + true); + public static UcdPropertyDetail Pattern_White_Space_Detail = + new UcdPropertyDetail( + UcdProperty.Pattern_White_Space, + VersionInfo.getInstance(4, 1, 0), + 73, + true, + false, + false, + true); + public static UcdPropertyDetail Pattern_Syntax_Detail = + new UcdPropertyDetail( + UcdProperty.Pattern_Syntax, + VersionInfo.getInstance(4, 1, 0), + 74, + true, + false, + false, + true); + public static UcdPropertyDetail Grapheme_Cluster_Break_Detail = + new UcdPropertyDetail( + UcdProperty.Grapheme_Cluster_Break, + VersionInfo.getInstance(4, 1, 0), + 75, + true, + false, + false, + true); + public static UcdPropertyDetail Word_Break_Detail = + new UcdPropertyDetail( + UcdProperty.Word_Break, + VersionInfo.getInstance(4, 1, 0), + 76, + true, + false, + false, + true); + public static UcdPropertyDetail Sentence_Break_Detail = + new UcdPropertyDetail( + UcdProperty.Sentence_Break, + VersionInfo.getInstance(4, 1, 0), + 77, + true, + false, + false, + true); + public static UcdPropertyDetail Composition_Exclusion_Detail = + new UcdPropertyDetail( + UcdProperty.Composition_Exclusion, + VersionInfo.getInstance(3, 0, 0), + 78, + true, + false, + false, + true); + public static UcdPropertyDetail Full_Composition_Exclusion_Detail = + new UcdPropertyDetail( + UcdProperty.Full_Composition_Exclusion, + VersionInfo.getInstance(3, 1, 0), + 79, + true, + false, + false, + true); + public static UcdPropertyDetail NFC_Quick_Check_Detail = + new UcdPropertyDetail( + UcdProperty.NFC_Quick_Check, + VersionInfo.getInstance(3, 2, 0), + 80, + true, + false, + false, + true); + public static UcdPropertyDetail NFD_Quick_Check_Detail = + new UcdPropertyDetail( + UcdProperty.NFD_Quick_Check, + VersionInfo.getInstance(3, 2, 0), + 81, + true, + false, + false, + true); + public static 
UcdPropertyDetail NFKC_Quick_Check_Detail = + new UcdPropertyDetail( + UcdProperty.NFKC_Quick_Check, + VersionInfo.getInstance(5, 2, 0), + 82, + true, + false, + false, + true); + public static UcdPropertyDetail NFKD_Quick_Check_Detail = + new UcdPropertyDetail( + UcdProperty.NFKD_Quick_Check, + VersionInfo.getInstance(3, 2, 0), + 83, + true, + false, + false, + true); + public static UcdPropertyDetail Expands_On_NFC_Detail = + new UcdPropertyDetail( + UcdProperty.Expands_On_NFC, + VersionInfo.getInstance(3, 2, 0), + 84, + true, + false, + false, + true); + public static UcdPropertyDetail Expands_On_NFD_Detail = + new UcdPropertyDetail( + UcdProperty.Expands_On_NFD, + VersionInfo.getInstance(3, 2, 0), + 85, + true, + false, + false, + true); + public static UcdPropertyDetail Expands_On_NFKC_Detail = + new UcdPropertyDetail( + UcdProperty.Expands_On_NFKC, + VersionInfo.getInstance(3, 2, 0), + 86, + true, + false, + false, + true); + public static UcdPropertyDetail Expands_On_NFKD_Detail = + new UcdPropertyDetail( + UcdProperty.Expands_On_NFKD, + VersionInfo.getInstance(3, 2, 0), + 87, + true, + false, + false, + true); + public static UcdPropertyDetail FC_NFC_Closure_Detail = + new UcdPropertyDetail( + UcdProperty.FC_NFKC_Closure, + VersionInfo.getInstance(3, 1, 0), + 88, + true, + false, + false, + true); + public static UcdPropertyDetail Case_Ignorable_Detail = + new UcdPropertyDetail( + UcdProperty.Case_Ignorable, + VersionInfo.getInstance(5, 2, 0), + 89, + true, + false, + false, + true); + public static UcdPropertyDetail Cased_Detail = + new UcdPropertyDetail( + UcdProperty.Cased, + VersionInfo.getInstance(5, 2, 0), + 90, + true, + false, + false, + true); + public static UcdPropertyDetail Changes_When_CaseFolded_Detail = + new UcdPropertyDetail( + UcdProperty.Changes_When_Casefolded, + VersionInfo.getInstance(5, 2, 0), + 91, + true, + false, + false, + true); + public static UcdPropertyDetail Changes_When_CaseMapped_Detail = + new UcdPropertyDetail( + 
UcdProperty.Changes_When_Casemapped, + VersionInfo.getInstance(5, 2, 0), + 92, + true, + false, + false, + true); + public static UcdPropertyDetail Changes_When_NFKC_Casefolded_Detail = + new UcdPropertyDetail( + UcdProperty.Changes_When_NFKC_Casefolded, + VersionInfo.getInstance(5, 2, 0), + 93, + true, + false, + false, + true); + public static UcdPropertyDetail Changes_When_Lowercased_Detail = + new UcdPropertyDetail( + UcdProperty.Changes_When_Lowercased, + VersionInfo.getInstance(5, 2, 0), + 94, + true, + false, + false, + true); + public static UcdPropertyDetail Changes_When_Titlecased_Detail = + new UcdPropertyDetail( + UcdProperty.Changes_When_Titlecased, + VersionInfo.getInstance(5, 2, 0), + 95, + true, + false, + false, + true); + public static UcdPropertyDetail Changes_When_Uppercased_Detail = + new UcdPropertyDetail( + UcdProperty.Changes_When_Uppercased, + VersionInfo.getInstance(5, 2, 0), + 96, + true, + false, + false, + true); + public static UcdPropertyDetail NFKC_Casefold_Detail = + new UcdPropertyDetail( + UcdProperty.NFKC_Casefold, + VersionInfo.getInstance(5, 2, 0), + 97, + true, + false, + false, + true); + public static UcdPropertyDetail Indic_Syllabic_Category_Detail = + new UcdPropertyDetail( + UcdProperty.Indic_Syllabic_Category, + VersionInfo.getInstance(6, 1, 0), + 98, + true, + false, + false, + true); + // public static UcdPropertyDetail Indic_Matra_Category_Detail = new UcdPropertyDetail ( + // UcdProperty.Indic_Matra_Category, VersionInfo.getInstance(6,1,0), + // VersionInfo.getInstance(7,0,0), 99, + // true, false, false, true); + public static UcdPropertyDetail Indic_Positional_Category_Detail = + new UcdPropertyDetail( + UcdProperty.Indic_Positional_Category, + VersionInfo.getInstance(8, 0, 0), + 100, + true, + false, + false, + true); + public static UcdPropertyDetail kJa_Detail = + new UcdPropertyDetail( + UcdProperty.kJa, + VersionInfo.getInstance(8, 0, 0), + 101, + false, + true, + false, + true); + public static 
UcdPropertyDetail Prepended_Concatenation_Mark_Detail = + new UcdPropertyDetail( + UcdProperty.Prepended_Concatenation_Mark, + VersionInfo.getInstance(9, 0, 0), + 102, + true, + false, + false, + true); + public static UcdPropertyDetail Vertical_Orientation_Detail = + new UcdPropertyDetail( + UcdProperty.Vertical_Orientation, + VersionInfo.getInstance(10, 0, 0), + 103, + true, + false, + false, + true); + public static UcdPropertyDetail Regional_Indicator_Detail = + new UcdPropertyDetail( + UcdProperty.Regional_Indicator, + VersionInfo.getInstance(10, 0, 0), + 104, + true, + false, + false, + true); + public static UcdPropertyDetail Block_Detail = + new UcdPropertyDetail( + UcdProperty.Block, + VersionInfo.getInstance(2, 0, 0), + 105, + true, + false, + false, + true); + public static UcdPropertyDetail Equivalent_Unified_Ideograph_Detail = + new UcdPropertyDetail( + UcdProperty.Equivalent_Unified_Ideograph, + VersionInfo.getInstance(11, 0, 0), + 106, + false, + true, + false, + true); + public static UcdPropertyDetail kCompatibilityVariant_Detail = + new UcdPropertyDetail( + UcdProperty.kCompatibilityVariant, + VersionInfo.getInstance(3, 2, 0), + 107, + false, + true, + true, + true); + public static UcdPropertyDetail kRSUnicode_Detail = + new UcdPropertyDetail( + UcdProperty.kRSUnicode, + VersionInfo.getInstance(2, 0, 0), + 108, + false, + true, + false, + true); + // public static UcdPropertyDetail kIRG_RSIndex_Detail = new UcdPropertyDetail ( + // UcdProperty.kIRG_RSIndex, VersionInfo.getInstance(11,0,0), 109, + // false, true, false, true); + public static UcdPropertyDetail kIRG_GSource_Detail = + new UcdPropertyDetail( + UcdProperty.kIRG_GSource, + VersionInfo.getInstance(3, 0, 0), + 110, + false, + true, + true, + true); + public static UcdPropertyDetail kIRG_TSource_Detail = + new UcdPropertyDetail( + UcdProperty.kIRG_TSource, + VersionInfo.getInstance(3, 0, 0), + 111, + false, + true, + true, + true); + public static UcdPropertyDetail kIRG_JSource_Detail = 
+ new UcdPropertyDetail( + UcdProperty.kIRG_JSource, + VersionInfo.getInstance(3, 0, 0), + 112, + false, + true, + true, + true); + public static UcdPropertyDetail kIRG_KSource_Detail = + new UcdPropertyDetail( + UcdProperty.kIRG_KSource, + VersionInfo.getInstance(3, 0, 0), + 113, + false, + true, + true, + true); + public static UcdPropertyDetail kIRG_KPSource_Detail = + new UcdPropertyDetail( + UcdProperty.kIRG_KPSource, + VersionInfo.getInstance(3, 1, 1), + 114, + false, + true, + true, + true); + public static UcdPropertyDetail kIRG_VSource_Detail = + new UcdPropertyDetail( + UcdProperty.kIRG_VSource, + VersionInfo.getInstance(3, 0, 0), + 115, + false, + true, + true, + true); + public static UcdPropertyDetail kIRG_HSource_Detail = + new UcdPropertyDetail( + UcdProperty.kIRG_HSource, + VersionInfo.getInstance(3, 1, 0), + 116, + false, + true, + true, + true); + public static UcdPropertyDetail kIRG_USource_Detail = + new UcdPropertyDetail( + UcdProperty.kIRG_USource, + VersionInfo.getInstance(4, 0, 1), + 117, + false, + true, + true, + true); + public static UcdPropertyDetail kIRG_MSource_Detail = + new UcdPropertyDetail( + UcdProperty.kIRG_MSource, + VersionInfo.getInstance(5, 2, 0), + 118, + false, + true, + true, + true); + public static UcdPropertyDetail kIRG_UKSource_Detail = + new UcdPropertyDetail( + UcdProperty.kIRG_UKSource, + VersionInfo.getInstance(13, 0, 0), + 119, + false, + true, + true, + true); + public static UcdPropertyDetail kIRG_SSource_Detail = + new UcdPropertyDetail( + UcdProperty.kIRG_SSource, + VersionInfo.getInstance(13, 0, 0), + 120, + false, + true, + true, + true); + public static UcdPropertyDetail kIICore_Detail = + new UcdPropertyDetail( + UcdProperty.kIICore, + VersionInfo.getInstance(4, 1, 0), + 121, + false, + true, + false, + true); + public static UcdPropertyDetail kUnihanCore2020_Detail = + new UcdPropertyDetail( + UcdProperty.kUnihanCore2020, + VersionInfo.getInstance(13, 0, 0), + 122, + false, + true, + false, + true); + 
public static UcdPropertyDetail kGB0_Detail = + new UcdPropertyDetail( + UcdProperty.kGB0, + VersionInfo.getInstance(2, 0, 0), + 123, + false, + true, + false, + true); + public static UcdPropertyDetail kGB1_Detail = + new UcdPropertyDetail( + UcdProperty.kGB1, + VersionInfo.getInstance(2, 0, 0), + 124, + false, + true, + false, + true); + public static UcdPropertyDetail kGB3_Detail = + new UcdPropertyDetail( + UcdProperty.kGB3, + VersionInfo.getInstance(2, 0, 0), + 125, + false, + true, + false, + true); + public static UcdPropertyDetail kGB5_Detail = + new UcdPropertyDetail( + UcdProperty.kGB5, + VersionInfo.getInstance(2, 0, 0), + 126, + false, + true, + false, + true); + public static UcdPropertyDetail kGB7_Detail = + new UcdPropertyDetail( + UcdProperty.kGB7, + VersionInfo.getInstance(2, 0, 0), + 127, + false, + true, + false, + true); + public static UcdPropertyDetail kGB8_Detail = + new UcdPropertyDetail( + UcdProperty.kGB8, + VersionInfo.getInstance(2, 0, 0), + 128, + false, + true, + false, + true); + public static UcdPropertyDetail kCNS1986_Detail = + new UcdPropertyDetail( + UcdProperty.kCNS1986, + VersionInfo.getInstance(2, 0, 0), + 129, + false, + true, + false, + true); + public static UcdPropertyDetail kCNS1992_Detail = + new UcdPropertyDetail( + UcdProperty.kCNS1992, + VersionInfo.getInstance(2, 0, 0), + 130, + false, + true, + false, + true); + public static UcdPropertyDetail kJis0_Detail = + new UcdPropertyDetail( + UcdProperty.kJis0, + VersionInfo.getInstance(2, 0, 0), + 131, + false, + true, + false, + true); + public static UcdPropertyDetail kJis1_Detail = + new UcdPropertyDetail( + UcdProperty.kJis1, + VersionInfo.getInstance(2, 0, 0), + 132, + false, + true, + false, + true); + public static UcdPropertyDetail kJIS0213_Detail = + new UcdPropertyDetail( + UcdProperty.kJIS0213, + VersionInfo.getInstance(3, 1, 1), + 133, + false, + true, + false, + true); + public static UcdPropertyDetail kKSC0_Detail = + new UcdPropertyDetail( + 
UcdProperty.kKSC0, + VersionInfo.getInstance(2, 0, 0), + VersionInfo.getInstance(15, 1, 0), + 134, + false, + true, + false, + true); + public static UcdPropertyDetail kKSC1_Detail = + new UcdPropertyDetail( + UcdProperty.kKSC1, + VersionInfo.getInstance(2, 0, 0), + VersionInfo.getInstance(15, 1, 0), + 135, + false, + true, + false, + true); + public static UcdPropertyDetail kKPS0_Detail = + new UcdPropertyDetail( + UcdProperty.kKPS0, + VersionInfo.getInstance(3, 1, 1), + VersionInfo.getInstance(15, 1, 0), + 136, + false, + true, + false, + true); + public static UcdPropertyDetail kKPS1_Detail = + new UcdPropertyDetail( + UcdProperty.kKPS1, + VersionInfo.getInstance(3, 1, 1), + VersionInfo.getInstance(15, 1, 0), + 137, + false, + true, + false, + true); + public static UcdPropertyDetail kHKSCS_Detail = + new UcdPropertyDetail( + UcdProperty.kHKSCS, + VersionInfo.getInstance(3, 1, 1), + VersionInfo.getInstance(15, 1, 0), + 138, + false, + true, + false, + true); + public static UcdPropertyDetail kCantonese_Detail = + new UcdPropertyDetail( + UcdProperty.kCantonese, + VersionInfo.getInstance(2, 0, 0), + 139, + false, + true, + false, + true); + public static UcdPropertyDetail kHangul_Detail = + new UcdPropertyDetail( + UcdProperty.kHangul, + VersionInfo.getInstance(5, 0, 0), + 140, + false, + true, + false, + true); + public static UcdPropertyDetail kDefinition_Detail = + new UcdPropertyDetail( + UcdProperty.kDefinition, + VersionInfo.getInstance(2, 0, 0), + 141, + false, + true, + false, + true); + public static UcdPropertyDetail kHanYu_Detail = + new UcdPropertyDetail( + UcdProperty.kHanYu, + VersionInfo.getInstance(2, 0, 0), + 142, + false, + true, + false, + true); + // public static UcdPropertyDetail kAlternateHanYu_Detail = new UcdPropertyDetail ( + // UcdProperty.kAlternateHanYu, VersionInfo.getInstance(2,0,0), + // VersionInfo.getInstance(3,1,1), 143, + // false, true, false, true); + public static UcdPropertyDetail kMandarin_Detail = + new UcdPropertyDetail( 
+ UcdProperty.kMandarin, + VersionInfo.getInstance(2, 0, 0), + 144, + false, + true, + false, + true); + public static UcdPropertyDetail kCihaiT_Detail = + new UcdPropertyDetail( + UcdProperty.kCihaiT, + VersionInfo.getInstance(3, 2, 0), + 145, + false, + true, + false, + true); + public static UcdPropertyDetail kSBGY_Detail = + new UcdPropertyDetail( + UcdProperty.kSBGY, + VersionInfo.getInstance(3, 2, 0), + 146, + false, + true, + false, + true); + public static UcdPropertyDetail kNelson_Detail = + new UcdPropertyDetail( + UcdProperty.kNelson, + VersionInfo.getInstance(2, 0, 0), + 147, + false, + true, + false, + true); + public static UcdPropertyDetail kCowles_Detail = + new UcdPropertyDetail( + UcdProperty.kCowles, + VersionInfo.getInstance(3, 1, 1), + 148, + false, + true, + false, + true); + public static UcdPropertyDetail kMatthews_Detail = + new UcdPropertyDetail( + UcdProperty.kMatthews, + VersionInfo.getInstance(2, 0, 0), + 149, + false, + true, + false, + true); + public static UcdPropertyDetail kOtherNumeric_Detail = + new UcdPropertyDetail( + UcdProperty.kOtherNumeric, + VersionInfo.getInstance(3, 2, 0), + 150, + false, + true, + false, + true); + public static UcdPropertyDetail kPhonetic_Detail = + new UcdPropertyDetail( + UcdProperty.kPhonetic, + VersionInfo.getInstance(3, 1, 0), + 151, + false, + true, + false, + true); + public static UcdPropertyDetail kGSR_Detail = + new UcdPropertyDetail( + UcdProperty.kGSR, + VersionInfo.getInstance(4, 0, 1), + 152, + false, + true, + false, + true); + public static UcdPropertyDetail kFenn_Detail = + new UcdPropertyDetail( + UcdProperty.kFenn, + VersionInfo.getInstance(3, 1, 1), + 153, + false, + true, + false, + true); + public static UcdPropertyDetail kFennIndex_Detail = + new UcdPropertyDetail( + UcdProperty.kFennIndex, + VersionInfo.getInstance(4, 1, 0), + 154, + false, + true, + false, + true); + public static UcdPropertyDetail kKarlgren_Detail = + new UcdPropertyDetail( + UcdProperty.kKarlgren, + 
VersionInfo.getInstance(3, 1, 1), + 155, + false, + true, + false, + true); + public static UcdPropertyDetail kCangjie_Detail = + new UcdPropertyDetail( + UcdProperty.kCangjie, + VersionInfo.getInstance(3, 1, 1), + 156, + false, + true, + false, + true); + public static UcdPropertyDetail kMeyerWempe_Detail = + new UcdPropertyDetail( + UcdProperty.kMeyerWempe, + VersionInfo.getInstance(3, 1, 0), + 157, + false, + true, + false, + true); + public static UcdPropertyDetail kSimplifiedVariant_Detail = + new UcdPropertyDetail( + UcdProperty.kSimplifiedVariant, + VersionInfo.getInstance(2, 0, 0), + 158, + false, + true, + false, + true); + public static UcdPropertyDetail kTraditionalVariant_Detail = + new UcdPropertyDetail( + UcdProperty.kTraditionalVariant, + VersionInfo.getInstance(2, 0, 0), + 159, + false, + true, + false, + true); + public static UcdPropertyDetail kSpecializedSemanticVariant_Detail = + new UcdPropertyDetail( + UcdProperty.kSpecializedSemanticVariant, + VersionInfo.getInstance(2, 0, 0), + 160, + false, + true, + false, + true); + public static UcdPropertyDetail kSemanticVariant_Detail = + new UcdPropertyDetail( + UcdProperty.kSemanticVariant, + VersionInfo.getInstance(2, 0, 0), + 161, + false, + true, + false, + true); + public static UcdPropertyDetail kVietnamese_Detail = + new UcdPropertyDetail( + UcdProperty.kVietnamese, + VersionInfo.getInstance(3, 1, 1), + 162, + false, + true, + false, + true); + public static UcdPropertyDetail kLau_Detail = + new UcdPropertyDetail( + UcdProperty.kLau, + VersionInfo.getInstance(3, 1, 1), + 163, + false, + true, + false, + true); + public static UcdPropertyDetail kTang_Detail = + new UcdPropertyDetail( + UcdProperty.kTang, + VersionInfo.getInstance(2, 0, 0), + 164, + false, + true, + false, + true); + public static UcdPropertyDetail kZVariant_Detail = + new UcdPropertyDetail( + UcdProperty.kZVariant, + VersionInfo.getInstance(2, 0, 0), + 165, + false, + true, + false, + true); + public static UcdPropertyDetail 
kJapaneseKun_Detail = + new UcdPropertyDetail( + UcdProperty.kJapaneseKun, + VersionInfo.getInstance(2, 0, 0), + 166, + false, + true, + false, + true); + public static UcdPropertyDetail kJapaneseOn_Detail = + new UcdPropertyDetail( + UcdProperty.kJapaneseOn, + VersionInfo.getInstance(2, 0, 0), + 167, + false, + true, + false, + true); + public static UcdPropertyDetail kKangXi_Detail = + new UcdPropertyDetail( + UcdProperty.kKangXi, + VersionInfo.getInstance(2, 0, 0), + 168, + false, + true, + false, + true); + // public static UcdPropertyDetail kAlternateKangXi_Detail = new UcdPropertyDetail ( + // UcdProperty.kAlternateKangXi, VersionInfo.getInstance(2,0,0), + // VersionInfo.getInstance(4,0,1), 169, + // false, true, false, true); + public static UcdPropertyDetail kBigFive_Detail = + new UcdPropertyDetail( + UcdProperty.kBigFive, + VersionInfo.getInstance(2, 0, 0), + 170, + false, + true, + false, + true); + public static UcdPropertyDetail kCCCII_Detail = + new UcdPropertyDetail( + UcdProperty.kCCCII, + VersionInfo.getInstance(2, 0, 0), + 171, + false, + true, + false, + true); + public static UcdPropertyDetail kDaeJaweon_Detail = + new UcdPropertyDetail( + UcdProperty.kDaeJaweon, + VersionInfo.getInstance(2, 0, 0), + 172, + false, + true, + false, + true); + public static UcdPropertyDetail kEACC_Detail = + new UcdPropertyDetail( + UcdProperty.kEACC, + VersionInfo.getInstance(2, 0, 0), + 173, + false, + true, + false, + true); + public static UcdPropertyDetail kFrequency_Detail = + new UcdPropertyDetail( + UcdProperty.kFrequency, + VersionInfo.getInstance(3, 2, 0), + VersionInfo.getInstance(16, 0, 0), + 174, + false, + true, + false, + true); + public static UcdPropertyDetail kGradeLevel_Detail = + new UcdPropertyDetail( + UcdProperty.kGradeLevel, + VersionInfo.getInstance(3, 2, 0), + 175, + false, + true, + false, + true); + public static UcdPropertyDetail kHDZRadBreak_Detail = + new UcdPropertyDetail( + UcdProperty.kHDZRadBreak, + VersionInfo.getInstance(4, 1, 
0), + 176, + false, + true, + false, + true); + public static UcdPropertyDetail kHKGlyph_Detail = + new UcdPropertyDetail( + UcdProperty.kHKGlyph, + VersionInfo.getInstance(3, 1, 1), + 177, + false, + true, + false, + true); + public static UcdPropertyDetail kHanyuPinlu_Detail = + new UcdPropertyDetail( + UcdProperty.kHanyuPinlu, + VersionInfo.getInstance(4, 0, 1), + 178, + false, + true, + false, + true); + public static UcdPropertyDetail kHanyuPinyin_Detail = + new UcdPropertyDetail( + UcdProperty.kHanyuPinyin, + VersionInfo.getInstance(5, 2, 0), + 179, + false, + true, + false, + true); + public static UcdPropertyDetail kIRGHanyuDaZidian_Detail = + new UcdPropertyDetail( + UcdProperty.kIRGHanyuDaZidian, + VersionInfo.getInstance(3, 0, 0), + 180, + false, + true, + false, + true); + public static UcdPropertyDetail kIRGKangXi_Detail = + new UcdPropertyDetail( + UcdProperty.kIRGKangXi, + VersionInfo.getInstance(3, 0, 0), + 181, + false, + true, + false, + true); + public static UcdPropertyDetail kIRGDaeJaweon_Detail = + new UcdPropertyDetail( + UcdProperty.kIRGDaeJaweon, + VersionInfo.getInstance(3, 0, 0), + 182, + false, + true, + false, + true); + public static UcdPropertyDetail kIRGDaiKanwaZiten_Detail = + new UcdPropertyDetail( + UcdProperty.kIRGDaiKanwaZiten, + VersionInfo.getInstance(3, 0, 0), + VersionInfo.getInstance(15, 1, 0), + 183, + false, + true, + false, + true); + public static UcdPropertyDetail kKorean_Detail = + new UcdPropertyDetail( + UcdProperty.kKorean, + VersionInfo.getInstance(2, 0, 0), + 184, + false, + true, + false, + true); + public static UcdPropertyDetail kMainlandTelegraph_Detail = + new UcdPropertyDetail( + UcdProperty.kMainlandTelegraph, + VersionInfo.getInstance(2, 0, 0), + 185, + false, + true, + false, + true); + public static UcdPropertyDetail kMorohashi_Detail = + new UcdPropertyDetail( + UcdProperty.kMorohashi, + VersionInfo.getInstance(2, 0, 0), + 186, + false, + true, + false, + true); + // public static UcdPropertyDetail 
kAlternateMorohashi_Detail = new UcdPropertyDetail ( + // UcdProperty.kAlternateMorohashi, VersionInfo.getInstance(2,0,0), + // VersionInfo.getInstance(4,0,1), 187, + // false, true, false, true); + public static UcdPropertyDetail kPrimaryNumeric_Detail = + new UcdPropertyDetail( + UcdProperty.kPrimaryNumeric, + VersionInfo.getInstance(3, 2, 0), + 188, + false, + true, + false, + true); + public static UcdPropertyDetail kTaiwanTelegraph_Detail = + new UcdPropertyDetail( + UcdProperty.kTaiwanTelegraph, + VersionInfo.getInstance(2, 0, 0), + 189, + false, + true, + false, + true); + public static UcdPropertyDetail kXerox_Detail = + new UcdPropertyDetail( + UcdProperty.kXerox, + VersionInfo.getInstance(2, 0, 0), + 190, + false, + true, + false, + true); + public static UcdPropertyDetail kPseudoGB1_Detail = + new UcdPropertyDetail( + UcdProperty.kPseudoGB1, + VersionInfo.getInstance(2, 0, 0), + 191, + false, + true, + false, + true); + public static UcdPropertyDetail kIBMJapan_Detail = + new UcdPropertyDetail( + UcdProperty.kIBMJapan, + VersionInfo.getInstance(2, 0, 0), + 192, + false, + true, + false, + true); + public static UcdPropertyDetail kAccountingNumeric_Detail = + new UcdPropertyDetail( + UcdProperty.kAccountingNumeric, + VersionInfo.getInstance(3, 2, 0), + 193, + false, + true, + false, + true); + public static UcdPropertyDetail kCheungBauer_Detail = + new UcdPropertyDetail( + UcdProperty.kCheungBauer, + VersionInfo.getInstance(5, 0, 0), + 194, + false, + true, + false, + true); + public static UcdPropertyDetail kCheungBauerIndex_Detail = + new UcdPropertyDetail( + UcdProperty.kCheungBauerIndex, + VersionInfo.getInstance(5, 0, 0), + 195, + false, + true, + false, + true); + public static UcdPropertyDetail kFourCornerCode_Detail = + new UcdPropertyDetail( + UcdProperty.kFourCornerCode, + VersionInfo.getInstance(5, 0, 0), + 196, + false, + true, + false, + true); + // public static UcdPropertyDetail kWubi_Detail = new UcdPropertyDetail ( + // UcdProperty.kWubi, 
VersionInfo.getInstance(11,0,0), 197, + // false, true, false, true); + public static UcdPropertyDetail kXHC1983_Detail = + new UcdPropertyDetail( + UcdProperty.kXHC1983, + VersionInfo.getInstance(5, 1, 0), + 198, + false, + true, + false, + true); + public static UcdPropertyDetail kJinmeiyoKanji_Detail = + new UcdPropertyDetail( + UcdProperty.kJinmeiyoKanji, + VersionInfo.getInstance(11, 0, 0), + 199, + false, + true, + false, + true); + public static UcdPropertyDetail kJoyoKanji_Detail = + new UcdPropertyDetail( + UcdProperty.kJoyoKanji, + VersionInfo.getInstance(11, 0, 0), + 200, + false, + true, + false, + true); + public static UcdPropertyDetail kKoreanEducationHanja_Detail = + new UcdPropertyDetail( + UcdProperty.kKoreanEducationHanja, + VersionInfo.getInstance(11, 0, 0), + 201, + false, + true, + false, + true); + public static UcdPropertyDetail kKoreanName_Detail = + new UcdPropertyDetail( + UcdProperty.kKoreanName, + VersionInfo.getInstance(11, 0, 0), + 202, + false, + true, + false, + true); + public static UcdPropertyDetail kTGH_Detail = + new UcdPropertyDetail( + UcdProperty.kTGH, + VersionInfo.getInstance(11, 0, 0), + 203, + false, + true, + false, + true); + public static UcdPropertyDetail kTGHZ2013_Detail = + new UcdPropertyDetail( + UcdProperty.kTGHZ2013, + VersionInfo.getInstance(13, 0, 0), + 204, + false, + true, + false, + true); + public static UcdPropertyDetail kSpoofingVariant_Detail = + new UcdPropertyDetail( + UcdProperty.kSpoofingVariant, + VersionInfo.getInstance(13, 0, 0), + 205, + false, + true, + false, + true); + public static UcdPropertyDetail kRSKanWa_Detail = + new UcdPropertyDetail( + UcdProperty.kRSKanWa, + VersionInfo.getInstance(2, 0, 0), + 206, + false, + true, + false, + true); + public static UcdPropertyDetail kRSJapanese_Detail = + new UcdPropertyDetail( + UcdProperty.kRSJapanese, + VersionInfo.getInstance(2, 0, 0), + 207, + false, + true, + false, + true); + public static UcdPropertyDetail kRSKorean_Detail = + new 
UcdPropertyDetail( + UcdProperty.kRSKorean, + VersionInfo.getInstance(2, 0, 0), + 208, + false, + true, + false, + true); + public static UcdPropertyDetail kRSKangXi_Detail = + new UcdPropertyDetail( + UcdProperty.kRSKangXi, + VersionInfo.getInstance(2, 0, 0), + VersionInfo.getInstance(15, 1, 0), + 209, + false, + true, + false, + true); + public static UcdPropertyDetail kRSAdobe_Japan1_6_Detail = + new UcdPropertyDetail( + UcdProperty.kRSAdobe_Japan1_6, + VersionInfo.getInstance(4, 1, 0), + 210, + false, + true, + false, + true); + public static UcdPropertyDetail kTotalStrokes_Detail = + new UcdPropertyDetail( + UcdProperty.kTotalStrokes, + VersionInfo.getInstance(3, 1, 0), + 211, + false, + true, + false, + true); + public static UcdPropertyDetail kRSTUnicode_Detail = + new UcdPropertyDetail( + UcdProperty.kRSTUnicode, + VersionInfo.getInstance(9, 0, 0), + 212, + false, + true, + false, + true); + public static UcdPropertyDetail kTGT_MergedSrc_Detail = + new UcdPropertyDetail( + UcdProperty.kTGT_MergedSrc, + VersionInfo.getInstance(9, 0, 0), + 213, + false, + true, + false, + true); + public static UcdPropertyDetail kSrc_NushuDuben_Detail = + new UcdPropertyDetail( + UcdProperty.kSrc_NushuDuben, + VersionInfo.getInstance(10, 0, 0), + 214, + false, + true, + false, + true); + public static UcdPropertyDetail kReading_Detail = + new UcdPropertyDetail( + UcdProperty.kReading, + VersionInfo.getInstance(10, 0, 0), + 215, + false, + true, + false, + true); + public static UcdPropertyDetail ISO_Comment_Detail = + new UcdPropertyDetail( + UcdProperty.ISO_Comment, + VersionInfo.getInstance(11, 0, 0), + 216, + true, + false, + false, + true); + public static UcdPropertyDetail Unicode_1_Name_Detail = + new UcdPropertyDetail( + UcdProperty.Unicode_1_Name, + VersionInfo.getInstance(2, 0, 0), + 217, + true, + false, + false, + true); + public static UcdPropertyDetail Name_Alias_Detail = + new UcdPropertyDetail( + UcdProperty.Name_Alias, + VersionInfo.getInstance(5, 0, 0), + 
218, + false, + false, + false, + true); + public static UcdPropertyDetail Emoji_Detail = + new UcdPropertyDetail( + UcdProperty.Emoji, + VersionInfo.getInstance(13, 0, 0), + 219, + true, + false, + false, + true); + public static UcdPropertyDetail Emoji_Presentation_Detail = + new UcdPropertyDetail( + UcdProperty.Emoji_Presentation, + VersionInfo.getInstance(13, 0, 0), + 220, + true, + false, + false, + true); + public static UcdPropertyDetail Emoji_Modifier_Detail = + new UcdPropertyDetail( + UcdProperty.Emoji_Modifier, + VersionInfo.getInstance(13, 0, 0), + 221, + true, + false, + false, + true); + public static UcdPropertyDetail Emoji_Modifier_Base_Detail = + new UcdPropertyDetail( + UcdProperty.Emoji_Modifier_Base, + VersionInfo.getInstance(13, 0, 0), + 222, + true, + false, + false, + true); + public static UcdPropertyDetail Emoji_Component_Detail = + new UcdPropertyDetail( + UcdProperty.Emoji_Component, + VersionInfo.getInstance(13, 0, 0), + 223, + true, + false, + false, + true); + public static UcdPropertyDetail Extended_Pictographic_Detail = + new UcdPropertyDetail( + UcdProperty.Extended_Pictographic, + VersionInfo.getInstance(13, 0, 0), + 224, + true, + false, + false, + true); + public static UcdPropertyDetail kStrange_Detail = + new UcdPropertyDetail( + UcdProperty.kStrange, + VersionInfo.getInstance(14, 0, 0), + 225, + false, + true, + false, + true); + public static UcdPropertyDetail kAlternateTotalStrokes_Detail = + new UcdPropertyDetail( + UcdProperty.kAlternateTotalStrokes, + VersionInfo.getInstance(15, 0, 0), + 226, + false, + true, + false, + true); + public static UcdPropertyDetail NFKC_Simple_Casefold_Detail = + new UcdPropertyDetail( + UcdProperty.NFKC_Simple_Casefold, + VersionInfo.getInstance(15, 1, 0), + 227, + true, + false, + false, + true); + public static UcdPropertyDetail ID_Compat_Math_Start_Detail = + new UcdPropertyDetail( + UcdProperty.ID_Compat_Math_Start, + VersionInfo.getInstance(15, 1, 0), + 228, + true, + false, + false, + 
true); + public static UcdPropertyDetail ID_Compat_Math_Continue_Detail = + new UcdPropertyDetail( + UcdProperty.ID_Compat_Math_Continue, + VersionInfo.getInstance(15, 1, 0), + 229, + true, + false, + false, + true); + public static UcdPropertyDetail IDS_Unary_Operator_Detail = + new UcdPropertyDetail( + UcdProperty.IDS_Unary_Operator, + VersionInfo.getInstance(15, 1, 0), + 230, + true, + false, + false, + true); + public static UcdPropertyDetail kJapanese_Detail = + new UcdPropertyDetail( + UcdProperty.kJapanese, + VersionInfo.getInstance(15, 1, 0), + 231, + false, + true, + false, + true); + public static UcdPropertyDetail kMojiJoho_Detail = + new UcdPropertyDetail( + UcdProperty.kMojiJoho, + VersionInfo.getInstance(15, 1, 0), + 232, + false, + true, + false, + true); + public static UcdPropertyDetail kSMSZD2003Index_Detail = + new UcdPropertyDetail( + UcdProperty.kSMSZD2003Index, + VersionInfo.getInstance(15, 1, 0), + 233, + false, + true, + false, + true); + public static UcdPropertyDetail kSMSZD2003Readings_Detail = + new UcdPropertyDetail( + UcdProperty.kSMSZD2003Readings, + VersionInfo.getInstance(15, 1, 0), + 234, + false, + true, + false, + true); + public static UcdPropertyDetail kVietnameseNumeric_Detail = + new UcdPropertyDetail( + UcdProperty.kVietnameseNumeric, + VersionInfo.getInstance(15, 1, 0), + 235, + false, + true, + false, + true); + public static UcdPropertyDetail kZhuangNumeric_Detail = + new UcdPropertyDetail( + UcdProperty.kZhuangNumeric, + VersionInfo.getInstance(15, 1, 0), + 236, + false, + true, + false, + true); + public static UcdPropertyDetail Indic_Conjunct_Break_Detail = + new UcdPropertyDetail( + UcdProperty.Indic_Conjunct_Break, + VersionInfo.getInstance(15, 1, 0), + 237, + true, + false, + false, + true); + public static UcdPropertyDetail Modifier_Combining_Mark_Detail = + new UcdPropertyDetail( + UcdProperty.Modifier_Combining_Mark, + VersionInfo.getInstance(16, 0, 0), + 238, + true, + false, + false, + true); + public static 
UcdPropertyDetail kFanqie_Detail = + new UcdPropertyDetail( + UcdProperty.kFanqie, + VersionInfo.getInstance(16, 0, 0), + 239, + false, + true, + false, + true); + public static UcdPropertyDetail kZhuang_Detail = + new UcdPropertyDetail( + UcdProperty.kZhuang, + VersionInfo.getInstance(16, 0, 0), + 240, + false, + true, + false, + true); + public static UcdPropertyDetail Basic_Emoji_Detail = + new UcdPropertyDetail(UcdProperty.Basic_Emoji, -1, false, false, false, false); + public static UcdPropertyDetail CJK_Radical_Detail = + new UcdPropertyDetail(UcdProperty.CJK_Radical, -2, false, false, false, false); + public static UcdPropertyDetail Confusable_MA_Detail = + new UcdPropertyDetail(UcdProperty.Confusable_MA, -3, false, false, false, false); + public static UcdPropertyDetail Confusable_ML_Detail = + new UcdPropertyDetail(UcdProperty.Confusable_ML, -4, false, false, false, false); + public static UcdPropertyDetail Confusable_SA_Detail = + new UcdPropertyDetail(UcdProperty.Confusable_SA, -5, false, false, false, false); + public static UcdPropertyDetail Confusable_SL_Detail = + new UcdPropertyDetail(UcdProperty.Confusable_SL, -6, false, false, false, false); + public static UcdPropertyDetail Do_Not_Emit_Preferred_Detail = + new UcdPropertyDetail( + UcdProperty.Do_Not_Emit_Preferred, -7, false, false, false, false); + public static UcdPropertyDetail Do_Not_Emit_Type_Detail = + new UcdPropertyDetail(UcdProperty.Do_Not_Emit_Type, -8, false, false, false, false); + public static UcdPropertyDetail Emoji_DCM_Detail = + new UcdPropertyDetail( + UcdProperty.Emoji_DCM, + VersionInfo.getInstance(6, 0, 0), + -9, + false, + false, + false, + false); + public static UcdPropertyDetail Emoji_KDDI_Detail = + new UcdPropertyDetail( + UcdProperty.Emoji_KDDI, + VersionInfo.getInstance(6, 0, 0), + -10, + false, + false, + false, + false); + public static UcdPropertyDetail Emoji_SB_Detail = + new UcdPropertyDetail( + UcdProperty.Emoji_SB, + VersionInfo.getInstance(6, 0, 0), + -11, + 
false, + false, + false, + false); + public static UcdPropertyDetail Identifier_Status_Detail = + new UcdPropertyDetail( + UcdProperty.Identifier_Status, + VersionInfo.getInstance(9, 0, 0), + -12, + false, + false, + false, + false); + public static UcdPropertyDetail Identifier_Type_Detail = + new UcdPropertyDetail( + UcdProperty.Identifier_Type, + VersionInfo.getInstance(9, 0, 0), + -13, + false, + false, + false, + false); + public static UcdPropertyDetail Idn_2008_Detail = + new UcdPropertyDetail(UcdProperty.Idn_2008, -14, false, false, false, false); + public static UcdPropertyDetail Idn_Mapping_Detail = + new UcdPropertyDetail(UcdProperty.Idn_Mapping, -15, false, false, false, false); + public static UcdPropertyDetail Idn_Status_Detail = + new UcdPropertyDetail(UcdProperty.Idn_Status, -16, false, false, false, false); + public static UcdPropertyDetail Named_Sequences_Detail = + new UcdPropertyDetail(UcdProperty.Named_Sequences, -17, false, false, false, false); + public static UcdPropertyDetail Named_Sequences_Prov_Detail = + new UcdPropertyDetail( + UcdProperty.Named_Sequences_Prov, -18, false, false, false, false); + public static UcdPropertyDetail Other_Joining_Type_Detail = + new UcdPropertyDetail(UcdProperty.Other_Joining_Type, -19, false, false, false, false); + public static UcdPropertyDetail RGI_Emoji_Flag_Sequence_Detail = + new UcdPropertyDetail( + UcdProperty.RGI_Emoji_Flag_Sequence, -20, false, false, false, false); + public static UcdPropertyDetail RGI_Emoji_Keycap_Sequence_Detail = + new UcdPropertyDetail( + UcdProperty.RGI_Emoji_Keycap_Sequence, -21, false, false, false, false); + public static UcdPropertyDetail RGI_Emoji_Modifier_Sequence_Detail = + new UcdPropertyDetail( + UcdProperty.RGI_Emoji_Modifier_Sequence, -22, false, false, false, false); + public static UcdPropertyDetail RGI_Emoji_Tag_Sequence_Detail = + new UcdPropertyDetail( + UcdProperty.RGI_Emoji_Tag_Sequence, -23, false, false, false, false); + public static UcdPropertyDetail 
RGI_Emoji_Zwj_Sequence_Detail = + new UcdPropertyDetail( + UcdProperty.RGI_Emoji_Zwj_Sequence, -24, false, false, false, false); + public static UcdPropertyDetail Standardized_Variant_Detail = + new UcdPropertyDetail( + UcdProperty.Standardized_Variant, -25, false, false, false, false); + + private UcdProperty ucdProperty; + private VersionInfo minVersion; + private VersionInfo maxVersion; + private int sortOrder; + private boolean isBaseAttribute; + private boolean isCJKAttribute; + private boolean isCJKShowIfEmpty; + private boolean isOrgUCDXMLAttribute; + + private UcdPropertyDetail( + UcdProperty ucdProperty, + VersionInfo minVersion, + int sortOrder, + boolean isBaseAttribute, + boolean isCJKAttribute, + boolean isCJKShowIfEmpty, + boolean isOrgUCDXMLAttribute) { + this( + ucdProperty, + minVersion, + null, + sortOrder, + isBaseAttribute, + isCJKAttribute, + isCJKShowIfEmpty, + isOrgUCDXMLAttribute); + } + + private UcdPropertyDetail( + UcdProperty ucdProperty, + int sortOrder, + boolean isBaseAttribute, + boolean isCJKAttribute, + boolean isCJKShowIfEmpty, + boolean isOrgUCDXMLAttribute) { + this( + ucdProperty, + null, + null, + sortOrder, + isBaseAttribute, + isCJKAttribute, + isCJKShowIfEmpty, + isOrgUCDXMLAttribute); + } + + private UcdPropertyDetail( + UcdProperty ucdProperty, + VersionInfo minVersion, + VersionInfo maxVersion, + int sortOrder, + boolean isBaseAttribute, + boolean isCJKAttribute, + boolean isCJKShowIfEmpty, + boolean isOrgUCDXMLAttribute) { + this.ucdProperty = ucdProperty; + this.minVersion = minVersion; + this.maxVersion = maxVersion; + this.sortOrder = sortOrder; + this.isBaseAttribute = isBaseAttribute; + this.isCJKAttribute = isCJKAttribute; + this.isCJKShowIfEmpty = isCJKShowIfEmpty; + this.isOrgUCDXMLAttribute = isOrgUCDXMLAttribute; + + allPropertyDetails.add(this); + if (isBaseAttribute) { + basePropertyDetails.add(this); + ucdxmlPropertyDetails.add(this); + } + if (isCJKAttribute) { + cjkPropertyDetails.add(this); + 
ucdxmlPropertyDetails.add(this); + } + } + + public static Set values() { + return allPropertyDetails; + } + + public static Set baseValues() { + return basePropertyDetails; + } + + public static Set cjkValues() { + return cjkPropertyDetails; + } + + public static Set ucdxmlValues() { + return ucdxmlPropertyDetails; + } + + public UcdProperty getUcdProperty() { + return this.ucdProperty; + } + + public VersionInfo getMinVersion() { + return this.minVersion; + } + + public VersionInfo getMaxVersion() { + return this.maxVersion; + } + + public boolean isBaseAttribute() { + return this.isBaseAttribute; + } + + public boolean isCJKAttribute() { + return this.isCJKAttribute; + } + + public boolean isCJKShowIfEmpty() { + return this.isCJKShowIfEmpty; + } + + public boolean isOrgUCDXMLAttribute() { + return this.isOrgUCDXMLAttribute; + } +} diff --git a/unicodetools/src/main/java/org/unicode/xml/UcdSectionComponent.java b/unicodetools/src/main/java/org/unicode/xml/UcdSectionComponent.java new file mode 100644 index 0000000000..0773486ccf --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/UcdSectionComponent.java @@ -0,0 +1,28 @@ +package org.unicode.xml; + +import com.ibm.icu.util.VersionInfo; +import org.unicode.props.UcdProperty; + +public class UcdSectionComponent { + private final VersionInfo minVersion; + private final VersionInfo maxVersion; + private final UcdProperty ucdProperty; + + UcdSectionComponent(VersionInfo minVersion, VersionInfo maxVersion, UcdProperty ucdProperty) { + this.minVersion = minVersion; + this.maxVersion = maxVersion; + this.ucdProperty = ucdProperty; + } + + public VersionInfo getMinVersion() { + return this.minVersion; + } + + public VersionInfo getMaxVersion() { + return this.maxVersion; + } + + public UcdProperty getUcdProperty() { + return this.ucdProperty; + } +} diff --git a/unicodetools/src/main/java/org/unicode/xml/UcdSectionDetail.java b/unicodetools/src/main/java/org/unicode/xml/UcdSectionDetail.java new file mode 
100644 index 0000000000..ceed693afd --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/UcdSectionDetail.java @@ -0,0 +1,224 @@ +package org.unicode.xml; + +import com.ibm.icu.util.VersionInfo; +import org.unicode.props.UcdProperty; + +public class UcdSectionDetail { + + public enum UcdSection { + BLOCKS( + "blocks", + "block", + VersionInfo.getInstance(1, 1, 0), + null, + Blocks_Detail, + true, + true), + CJKRADICALS( + "cjk-radicals", + "cjk-radical", + VersionInfo.getInstance(1, 1, 0), + null, + CJKRadicals_Detail, + false, + false), + DONOTEMIT( + "do-not-emit", + "instead", + VersionInfo.getInstance(16, 0, 0), + null, + DoNotEmit_Detail, + false, + false), + EMOJISOURCES( + "emoji-sources", + "emoji-source", + VersionInfo.getInstance(1, 1, 0), + null, + EmojiSources_Detail, + true, + false), + NAMEDSEQUENCES( + "named-sequences", + "named-sequence", + VersionInfo.getInstance(1, 1, 0), + null, + NamedSequences_Detail, + false, + false), + PROVISIONALNAMEDSEQUENCES( + "provisional-named-sequences", + "named-sequence", + VersionInfo.getInstance(5, 0, 0), + VersionInfo.getInstance(13, 0, 0), + ProvisionalNamedSequences_Detail, + false, + false), + NORMALIZATIONCORRECTIONS( + "normalization-corrections", + "normalization-correction", + VersionInfo.getInstance(1, 1, 0), + null, + NormalizationCorrections_Detail, + true, + false), + STANDARDIZEDVARIANTS( + "standardized-variants", + "standardized-variant", + VersionInfo.getInstance(1, 1, 0), + null, + StandardizedVariants_Detail, + true, + false); + private final String tag; + private final String childTag; + private final VersionInfo minVersion; + private final VersionInfo maxVersion; + private final UcdSectionDetail ucdSectionDetail; + private final boolean parserWithRange; + private final boolean parserWithMissing; + + UcdSection( + String tag, + String childTag, + VersionInfo minVersion, + VersionInfo maxVersion, + UcdSectionDetail ucdSectionDetail, + boolean parserWithRange, + boolean 
parserWithMissing) { + this.tag = tag; + this.childTag = childTag; + this.minVersion = minVersion; + this.maxVersion = maxVersion; + this.ucdSectionDetail = ucdSectionDetail; + this.parserWithRange = parserWithRange; + this.parserWithMissing = parserWithMissing; + } + + public String toString() { + return tag; + } + + public String getChildTag() { + return childTag; + } + + public VersionInfo getMinVersion() { + return minVersion; + } + + public VersionInfo getMaxVersion() { + return maxVersion; + } + + public UcdSectionDetail getUcdSectionDetail() { + return ucdSectionDetail; + } + + public boolean getParserWithRange() { + return parserWithRange; + } + + public boolean getParserWithMissing() { + return parserWithMissing; + } + } + + public static UcdSectionDetail Blocks_Detail = + new UcdSectionDetail( + UcdSection.BLOCKS, + new UcdSectionComponent[] { + new UcdSectionComponent( + VersionInfo.getInstance(1, 1, 0), null, UcdProperty.Block) + }, + 0); + public static UcdSectionDetail NamedSequences_Detail = + new UcdSectionDetail( + UcdSection.NAMEDSEQUENCES, + new UcdSectionComponent[] { + new UcdSectionComponent( + VersionInfo.getInstance(1, 1, 0), null, UcdProperty.Named_Sequences) + }, + 1); + public static UcdSectionDetail ProvisionalNamedSequences_Detail = + new UcdSectionDetail( + UcdSection.PROVISIONALNAMEDSEQUENCES, + new UcdSectionComponent[] { + new UcdSectionComponent( + VersionInfo.getInstance(5, 0, 0), + VersionInfo.getInstance(13, 0, 0), + UcdProperty.Named_Sequences_Prov) + }, + 1); + public static UcdSectionDetail NormalizationCorrections_Detail = + new UcdSectionDetail( + UcdSection.NORMALIZATIONCORRECTIONS, + new UcdSectionComponent[] { + new UcdSectionComponent( + VersionInfo.getInstance(1, 1, 0), null, UcdProperty.NC_Original) + }, + 2); + public static UcdSectionDetail StandardizedVariants_Detail = + new UcdSectionDetail( + UcdSection.STANDARDIZEDVARIANTS, + new UcdSectionComponent[] { + new UcdSectionComponent( + VersionInfo.getInstance(1, 1, 
0), + null, + UcdProperty.Standardized_Variant), + new UcdSectionComponent( + VersionInfo.getInstance(13, 0, 0), + null, + UcdProperty.emoji_variation_sequence) + }, + 3); + public static UcdSectionDetail CJKRadicals_Detail = + new UcdSectionDetail( + UcdSection.CJKRADICALS, + new UcdSectionComponent[] { + new UcdSectionComponent( + VersionInfo.getInstance(1, 1, 0), null, UcdProperty.CJK_Radical) + }, + 4); + public static UcdSectionDetail EmojiSources_Detail = + new UcdSectionDetail( + UcdSection.EMOJISOURCES, + new UcdSectionComponent[] { + new UcdSectionComponent( + VersionInfo.getInstance(1, 1, 0), null, UcdProperty.Emoji_DCM) + }, + 5); + public static UcdSectionDetail DoNotEmit_Detail = + new UcdSectionDetail( + UcdSection.DONOTEMIT, + new UcdSectionComponent[] { + new UcdSectionComponent( + VersionInfo.getInstance(1, 1, 0), + null, + UcdProperty.Do_Not_Emit_Type) + }, + 6); + + private final UcdSection ucdSection; + private final UcdSectionComponent[] ucdSectionComponents; + private final int sortOrder; + + private UcdSectionDetail( + UcdSection ucdSection, UcdSectionComponent[] ucdSectionComponents, int sortOrder) { + this.ucdSection = ucdSection; + this.ucdSectionComponents = ucdSectionComponents; + this.sortOrder = sortOrder; + } + + public UcdSection getSection() { + return this.ucdSection; + } + + public UcdSectionComponent[] getUcdSectionComponents() { + return this.ucdSectionComponents; + } + + public int getSortOrder() { + return this.sortOrder; + } +} diff --git a/unicodetools/src/main/java/org/unicode/xml/UcdXML.java b/unicodetools/src/main/java/org/unicode/xml/UcdXML.java new file mode 100644 index 0000000000..c71ac10826 --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/UcdXML.java @@ -0,0 +1,825 @@ +package org.unicode.xml; + +import com.ibm.icu.dev.tool.UOption; +import com.ibm.icu.util.VersionInfo; +import java.io.*; +import java.nio.charset.StandardCharsets; +import java.util.*; +import java.util.regex.Matcher; +import 
java.util.regex.Pattern; +import javax.xml.transform.TransformerConfigurationException; +import org.unicode.props.IndexUnicodeProperties; +import org.unicode.props.UcdProperty; +import org.unicode.props.UcdPropertyValues; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +public class UcdXML { + + private static final String NAMESPACE = "http://www.unicode.org/ns/2003/ucd/1.0"; + + private enum UCDXMLOUTPUTRANGE { + ALL, + NOUNIHAN, + UNIHAN; + } + + private enum UCDXMLOUTPUTTYPE { + FLAT, + GROUPED; + } + + private enum Range { + RESERVED("reserved"), + SURROGATE("surrogate"), + NONCHARACTER("noncharacter"), + CHARACTER("char"), + CJKUNIFIEDIDEOGRAPH("char"), + NONRANGE("nonrange"); + + private final String tag; + + Range(String tag) { + this.tag = tag; + } + + public String toString() { + return tag; + } + } + + private static final UOption[] options = { + UOption.HELP_H(), + UOption.create("ucdversion", 'v', UOption.REQUIRES_ARG), + UOption.create("range", 'r', UOption.REQUIRES_ARG), + UOption.create("output", 'o', UOption.REQUIRES_ARG), + UOption.create("outputfolder", 'f', UOption.REQUIRES_ARG) + }; + private static final int HELP = 0, UCDVERSION = 1, RANGE = 2, OUTPUT = 3, OUTPUTFOLDER = 4; + + public static void main(String[] args) throws Exception { + + VersionInfo ucdVersion = null; + UCDXMLOUTPUTRANGE[] ucdxmloutputranges = + new UCDXMLOUTPUTRANGE[] { + UCDXMLOUTPUTRANGE.ALL, UCDXMLOUTPUTRANGE.NOUNIHAN, UCDXMLOUTPUTRANGE.UNIHAN + }; + UCDXMLOUTPUTTYPE[] ucdxmloutputtypes = + new UCDXMLOUTPUTTYPE[] {UCDXMLOUTPUTTYPE.FLAT, UCDXMLOUTPUTTYPE.GROUPED}; + File destinationFolder = null; + + UOption.parseArgs(args, options); + + if (options[HELP].doesOccur) { + System.out.println( + "UcdXML --ucdversion {version number} --outputfolder {destination} " + + "--range [ALL|NOUNIHAN|UNIHAN] --output [FLAT|GROUPED]"); + System.exit(0); + } + + try { + if (options[UCDVERSION].doesOccur) { + try { + ucdVersion = 
VersionInfo.getInstance(options[UCDVERSION].value);
+                } catch (Exception e) {
+                    throw new IllegalArgumentException(
+                            "Could not convert "
+                                    + options[UCDVERSION].value
+                                    + " to a valid UCD version");
+                }
+            } else {
+                throw new IllegalArgumentException(
+                        "Missing command line option: --ucdversion (or -v)");
+            }
+            if (options[RANGE].doesOccur) {
+                try {
+                    ucdxmloutputranges =
+                            new UCDXMLOUTPUTRANGE[] {
+                                UCDXMLOUTPUTRANGE.valueOf(
+                                        options[RANGE].value.toUpperCase(Locale.ROOT))
+                            };
+                } catch (Exception e) {
+                    throw new IllegalArgumentException(
+                            "Could not convert "
+                                    + options[RANGE].value
+                                    + " to one of [ALL|NOUNIHAN|UNIHAN]");
+                }
+            }
+            if (options[OUTPUT].doesOccur) {
+                try {
+                    ucdxmloutputtypes =
+                            new UCDXMLOUTPUTTYPE[] {
+                                UCDXMLOUTPUTTYPE.valueOf(
+                                        options[OUTPUT].value.toUpperCase(Locale.ROOT))
+                            };
+                } catch (Exception e) {
+                    throw new IllegalArgumentException(
+                            "Could not convert "
+                                    + options[OUTPUT].value
+                                    + " to one of [FLAT|GROUPED]");
+                }
+            }
+            if (options[OUTPUTFOLDER].doesOccur) {
+                try {
+                    // The output goes to <outputfolder><version>/xmltest. The last component
+                    // is joined with File(parent, child) rather than hard-coded "\\"
+                    // separators, which only form a directory path on Windows.
+                    destinationFolder =
+                            new File(
+                                    options[OUTPUTFOLDER].value
+                                            + getVersionString(ucdVersion, 3),
+                                    "xmltest");
+                    // mkdirs() also creates the <version> parent when it does not exist yet;
+                    // mkdir() would fail in that case.
+                    if (!destinationFolder.isDirectory() && !destinationFolder.mkdirs()) {
+                        throw new IOException();
+                    }
+                } catch (Exception e) {
+                    throw new IllegalArgumentException(
+                            "Could not find or create " + options[OUTPUTFOLDER].value);
+                }
+            } else {
+                throw new IllegalArgumentException(
+                        "Missing command line option: --outputfolder (or -f)");
+            }
+
+        } catch (Exception e) {
+            System.err.println(e.getMessage());
+            System.exit(1);
+        }
+
+        if (ucdVersion != null && destinationFolder.exists()) {
+            for (UCDXMLOUTPUTRANGE ucdxmloutputrange : ucdxmloutputranges) {
+                for (UCDXMLOUTPUTTYPE ucdxmloutputtype : ucdxmloutputtypes) {
+                    System.out.println(
+                            "Building the "
+                                    + ucdxmloutputrange
+                                    + " "
+                                    + ucdxmloutputtype
+                                    + " UcdXML file for "
+                                    + ucdVersion);
+                    buildUcdXMLFile(
+                            ucdVersion, destinationFolder, ucdxmloutputrange, ucdxmloutputtype);
+ } + } + System.out.println("End"); + System.exit(0); + } else { + System.err.println("Unexpected error when building UcdXML file."); + System.exit(1); + } + } + + private static void buildUcdXMLFile( + VersionInfo ucdVersion, + File destinationFolder, + UCDXMLOUTPUTRANGE outputRange, + UCDXMLOUTPUTTYPE outputType) + throws IOException, TransformerConfigurationException, SAXException { + int lowCodepoint = 0x0; + int highCodepoint = 0x10FFFF; + // Tangut + // int lowCodepoint = 0x17000; + // int highCodepoint = 0x1B2FB; + // 0x10FFFF + + File tempFile = new File(destinationFolder, "temp.xml"); + String outputFilename = + "ucd." + + outputRange.toString().toLowerCase(Locale.ROOT) + + "." + + outputType.toString().toLowerCase(Locale.ROOT) + + ".xml"; + File destinationFile = new File(destinationFolder, outputFilename); + + FileOutputStream fileOutputStream = new FileOutputStream(tempFile); + UCDXMLWriter writer = new UCDXMLWriter(fileOutputStream); + + IndexUnicodeProperties iup = IndexUnicodeProperties.make(ucdVersion); + AttributeResolver attributeResolver = new AttributeResolver(iup); + UCDDataResolver ucdDataResolver = new UCDDataResolver(iup, NAMESPACE, writer); + + writer.startFile(); + writer.startElement("ucd"); + { + writer.startElement("description"); + { + writer.addContent("Unicode " + getVersionString(ucdVersion, 3)); + writer.endElement("description"); + } + buildRepertoire( + writer, + attributeResolver, + ucdVersion, + lowCodepoint, + highCodepoint, + outputRange, + outputType); + if (outputRange != UCDXMLOUTPUTRANGE.UNIHAN) { + ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.BLOCKS); + ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.NAMEDSEQUENCES); + ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.PROVISIONALNAMEDSEQUENCES); + ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.NORMALIZATIONCORRECTIONS); + ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.STANDARDIZEDVARIANTS); + if 
(ucdVersion.compareTo(VersionInfo.getInstance(5, 2, 0)) >= 0) { + ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.CJKRADICALS); + } + if (ucdVersion.compareTo(VersionInfo.getInstance(6, 0, 0)) >= 0) { + ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.EMOJISOURCES); + } + if (ucdVersion.compareTo(VersionInfo.getInstance(16, 0, 0)) >= 0) { + ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.DONOTEMIT); + } + } + writer.endElement("ucd"); + } + writer.endFile(); + fileOutputStream.close(); + cleanUcdXMLFile(tempFile, destinationFile); + if (!tempFile.delete()) { + throw new IOException("Could not delete temporary file " + tempFile); + } + } + + private static void cleanUcdXMLFile(File tempFile, File destinationFile) throws IOException { + // XALAN writes out characters outside the BMP as entities. + // Use this code to replace the entities with the correct characters. + // See: https://issues.apache.org/jira/browse/XALANJ-2595 + + FileInputStream fileInputStream = new FileInputStream(tempFile); + FileOutputStream fileOutputStream = new FileOutputStream(destinationFile); + + InputStreamReader inputStreamReader = + new InputStreamReader(fileInputStream, StandardCharsets.UTF_8); + OutputStreamWriter outputStreamWriter = + new OutputStreamWriter(fileOutputStream, StandardCharsets.UTF_8); + + BufferedReader bufferedReader = new BufferedReader(inputStreamReader); + BufferedWriter bufferedWriter = new BufferedWriter(outputStreamWriter); + + String line; + while ((line = bufferedReader.readLine()) != null) { + Matcher matcher = Pattern.compile("&#(\\d+);").matcher(line); + line = + matcher.replaceAll( + matchResult -> + new String( + Character.toChars(Integer.parseInt(matcher.group(1))))); + bufferedWriter.append(line); + bufferedWriter.newLine(); + } + bufferedWriter.flush(); + fileInputStream.close(); + fileOutputStream.close(); + } + + private static void buildRepertoire( + UCDXMLWriter writer, + AttributeResolver attributeResolver, + VersionInfo 
ucdVersion, + int lowCodepoint, + int highCodepoint, + UCDXMLOUTPUTRANGE outputRange, + UCDXMLOUTPUTTYPE outputType) + throws SAXException { + + writer.startElement("repertoire"); + { + for (int codepoint = lowCodepoint; codepoint <= highCodepoint; codepoint++) { + if (isWritableCodepoint(codepoint, outputRange, attributeResolver)) { + if (outputType == UCDXMLOUTPUTTYPE.GROUPED) { + codepoint = + buildGroup( + writer, + attributeResolver, + ucdVersion, + codepoint, + highCodepoint, + outputRange, + outputType); + } else { + codepoint = + buildChars( + writer, + attributeResolver, + ucdVersion, + codepoint, + highCodepoint, + outputRange, + outputType, + null); + } + } + } + writer.endElement("repertoire"); + } + } + + private static int buildGroup( + UCDXMLWriter writer, + AttributeResolver attributeResolver, + VersionInfo ucdVersion, + int lowCodepoint, + int highCodepoint, + UCDXMLOUTPUTRANGE outputRange, + UCDXMLOUTPUTTYPE outputType) + throws SAXException { + + int lastCodepointInGroup = + getLastCodepointInGroup(attributeResolver, lowCodepoint, highCodepoint); + + AttributesImpl groupAttrs = + getGroupAttributes( + ucdVersion, + attributeResolver, + lowCodepoint, + lastCodepointInGroup, + outputRange); + + writer.startElement("group", groupAttrs); + { + buildChars( + writer, + attributeResolver, + ucdVersion, + lowCodepoint, + lastCodepointInGroup, + outputRange, + outputType, + groupAttrs); + writer.endElement("group"); + } + return lastCodepointInGroup; + } + + private static int buildChars( + UCDXMLWriter writer, + AttributeResolver attributeResolver, + VersionInfo ucdVersion, + int lowCodepoint, + int highCodepoint, + UCDXMLOUTPUTRANGE outputRange, + UCDXMLOUTPUTTYPE outputType, + AttributesImpl groupAttrs) + throws SAXException { + + ArrayList range = new ArrayList<>(); + Range rangeType = Range.NONRANGE; + for (int codepoint = lowCodepoint; codepoint <= highCodepoint; codepoint++) { + if (attributeResolver.isUnassignedCodepoint(codepoint) + || 
(outputRange == UCDXMLOUTPUTRANGE.NOUNIHAN + && attributeResolver.isUnifiedIdeograph(codepoint))) { + Range currentRangeType = getRangeType(attributeResolver, codepoint); + if (!range.isEmpty()) { + if (!currentRangeType.equals(rangeType) + || attributeResolver.isDifferentRange( + ucdVersion, codepoint, codepoint - 1)) { + if (outputRange != UCDXMLOUTPUTRANGE.UNIHAN) { + if (outputType == UCDXMLOUTPUTTYPE.GROUPED) { + buildGroupedRange( + writer, + attributeResolver, + ucdVersion, + range, + rangeType, + groupAttrs); + } else { + buildUngroupedRange( + writer, attributeResolver, ucdVersion, range, rangeType); + } + } + range.clear(); + } + } + range.add(codepoint); + rangeType = currentRangeType; + } else { + if (!range.isEmpty()) { + if (outputRange != UCDXMLOUTPUTRANGE.UNIHAN) { + if (outputType == UCDXMLOUTPUTTYPE.GROUPED) { + buildGroupedRange( + writer, + attributeResolver, + ucdVersion, + range, + rangeType, + groupAttrs); + } else { + buildUngroupedRange( + writer, attributeResolver, ucdVersion, range, rangeType); + } + } + range.clear(); + rangeType = Range.NONRANGE; + } + if (isWritableCodepoint(codepoint, outputRange, attributeResolver)) { + if (outputType == UCDXMLOUTPUTTYPE.GROUPED) { + buildGroupedChar( + writer, + attributeResolver, + ucdVersion, + codepoint, + outputRange, + groupAttrs); + } else { + buildUngroupedChar( + writer, attributeResolver, ucdVersion, codepoint, outputRange); + } + } + } + } + // Handle any range before the end of the repertoire element. 
+ if (!range.isEmpty()) { + if (outputRange != UCDXMLOUTPUTRANGE.UNIHAN) { + if (outputType == UCDXMLOUTPUTTYPE.GROUPED) { + buildGroupedRange( + writer, attributeResolver, ucdVersion, range, rangeType, groupAttrs); + } else { + buildUngroupedRange(writer, attributeResolver, ucdVersion, range, rangeType); + } + } + } + return highCodepoint; + } + + private static void buildUngroupedChar( + UCDXMLWriter writer, + AttributeResolver attributeResolver, + VersionInfo ucdVersion, + int codepoint, + UCDXMLOUTPUTRANGE outputRange) + throws SAXException { + + AttributesImpl charAttributes = + getAttributes(ucdVersion, attributeResolver, codepoint, outputRange); + buildChar(writer, attributeResolver, codepoint, charAttributes); + } + + private static void buildGroupedChar( + UCDXMLWriter writer, + AttributeResolver attributeResolver, + VersionInfo ucdVersion, + int codepoint, + UCDXMLOUTPUTRANGE outputRange, + AttributesImpl groupAttrs) + throws SAXException { + + AttributesImpl orgCharAttributes = + getAttributes(ucdVersion, attributeResolver, codepoint, outputRange); + AttributesImpl charAttributes = new AttributesImpl(); + charAttributes.addAttribute( + NAMESPACE, "cp", "cp", "CDATA", attributeResolver.getHexString(codepoint)); + + for (UcdPropertyDetail propDetail : UcdPropertyDetail.ucdxmlValues()) { + String qName = propDetail.getUcdProperty().getShortName(); + if (qName.startsWith("cjk")) { + qName = qName.substring(2); + } + String orgCharAttributesValue = orgCharAttributes.getValue(qName); + String groupAttributeValue = groupAttrs.getValue(qName); + if (!Objects.equals(orgCharAttributesValue, groupAttributeValue)) { + charAttributes.addAttribute( + NAMESPACE, + qName, + qName, + "CDATA", + Objects.requireNonNullElse(orgCharAttributesValue, "")); + } + } + buildChar(writer, attributeResolver, codepoint, charAttributes); + } + + private static void buildChar( + UCDXMLWriter writer, + AttributeResolver attributeResolver, + int codepoint, + AttributesImpl 
charAttributes) + throws SAXException { + writer.startElement("char", charAttributes); + { + HashMap nameAliases = attributeResolver.getNameAliases(codepoint); + if (null != nameAliases && !nameAliases.isEmpty()) { + for (String alias : nameAliases.keySet()) { + AttributesImpl nameAliasAt = new AttributesImpl(); + nameAliasAt.addAttribute(NAMESPACE, "alias", "alias", "CDATA", alias); + String type = nameAliases.get(alias); + if (!Objects.equals(type, "none")) { + nameAliasAt.addAttribute( + NAMESPACE, "type", "type", "CDATA", nameAliases.get(alias)); + } + writer.startElement("name-alias", nameAliasAt); + { + writer.endElement("name-alias"); + } + } + } + writer.endElement("char"); + } + } + + private static void buildGroupedRange( + UCDXMLWriter writer, + AttributeResolver attributeResolver, + VersionInfo ucdVersion, + ArrayList range, + Range rangeType, + AttributesImpl groupAttrs) + throws SAXException { + AttributesImpl orgRangeAttributes = + getReservedAttributes(ucdVersion, attributeResolver, range); + AttributesImpl rangeAttributes = new AttributesImpl(); + if (range.size() == 1) { + rangeAttributes.addAttribute( + NAMESPACE, "cp", "cp", "CDATA", attributeResolver.getHexString(range.get(0))); + } else { + rangeAttributes.addAttribute( + NAMESPACE, + "first-cp", + "first-cp", + "CDATA", + attributeResolver.getHexString(range.get(0))); + rangeAttributes.addAttribute( + NAMESPACE, + "last-cp", + "last-cp", + "CDATA", + attributeResolver.getHexString(range.get(range.size() - 1))); + } + + for (UcdPropertyDetail propDetail : UcdPropertyDetail.ucdxmlValues()) { + String qName = propDetail.getUcdProperty().getShortName(); + if (qName.startsWith("cjk")) { + qName = qName.substring(2); + } + String orgCharAttributesValue = orgRangeAttributes.getValue(qName); + String groupAttributeValue = groupAttrs.getValue(qName); + if (!Objects.equals(orgCharAttributesValue, groupAttributeValue)) { + rangeAttributes.addAttribute( + NAMESPACE, + qName, + qName, + "CDATA", + 
Objects.requireNonNullElse(orgCharAttributesValue, "")); + } + } + writer.startElement(rangeType.tag, rangeAttributes); + { + writer.endElement(rangeType.tag); + } + } + + private static void buildUngroupedRange( + UCDXMLWriter writer, + AttributeResolver attributeResolver, + VersionInfo ucdVersion, + ArrayList range, + Range rangeType) + throws SAXException { + AttributesImpl rangeAttributes = + getReservedAttributes(ucdVersion, attributeResolver, range); + writer.startElement(rangeType.tag, rangeAttributes); + { + writer.endElement(rangeType.tag); + } + } + + private static boolean isWritableCodepoint( + int codepoint, UCDXMLOUTPUTRANGE outputRange, AttributeResolver attributeResolver) { + return outputRange == UCDXMLOUTPUTRANGE.ALL + || (outputRange == UCDXMLOUTPUTRANGE.UNIHAN + && attributeResolver.isUnihanAttributeRange(codepoint)) + || (outputRange == UCDXMLOUTPUTRANGE.NOUNIHAN + && !attributeResolver.isUnifiedIdeograph(codepoint)); + } + + private static Range getRangeType(AttributeResolver attributeResolver, int codepoint) { + String NChar = attributeResolver.getNChar(codepoint); + UcdPropertyValues.General_Category_Values gc = attributeResolver.getgc(codepoint); + + if (attributeResolver.isUnihanAttributeRange(codepoint)) { + return Range.CJKUNIFIEDIDEOGRAPH; + } + if (gc.equals(UcdPropertyValues.General_Category_Values.Surrogate)) { + return Range.SURROGATE; + } + if (gc.equals(UcdPropertyValues.General_Category_Values.Private_Use)) { + return Range.CHARACTER; + } + if (NChar.equals(UcdPropertyValues.Binary.Yes.getShortName())) { + return Range.NONCHARACTER; + } + return Range.RESERVED; + } + + private static int getLastCodepointInGroup( + AttributeResolver attributeResolver, int lowCodepoint, int highCodepoint) { + String blk = attributeResolver.getAttributeValue(UcdProperty.Block, lowCodepoint); + for (int codepoint = lowCodepoint; codepoint <= highCodepoint; codepoint++) { + if (!blk.equals(attributeResolver.getAttributeValue(UcdProperty.Block, 
codepoint))) {
+                return codepoint - 1;
+            }
+            // Forced group boundaries inside a block: the group ends on the code point just
+            // before each of the values below.
+            if (codepoint == 0x20 - 1 // put the C0 controls in their own group
+                    || codepoint == 0xa0 - 1 // put the C1 controls in their own group
+                    || codepoint == 0x1160 - 1 // split the jamos into three groups
+                    || codepoint == 0x11a8 - 1 // split the jamos into three groups
+                    || codepoint == 0x1f1e6 - 1 // put the regional indicators in their own group
+            ) {
+                return codepoint;
+            }
+        }
+        return highCodepoint;
+    }
+
+    /**
+     * Builds the XML attribute list for a single code point.
+     *
+     * <p>The list always starts with {@code cp}. It then gets one attribute for every property
+     * in {@code UcdPropertyDetail.ucdxmlValues()} whose version window contains {@code version}
+     * and which {@code getIsAttributeIncluded} accepts. Short names starting with {@code "cjk"}
+     * lose their first two characters, so the attribute name starts with {@code "k"}.
+     *
+     * @param version the UCD version being written
+     * @param attributeResolver source of the attribute values
+     * @param codepoint the code point to describe
+     * @param outputRange which slice of the repertoire is being written
+     * @return the attributes for the element describing {@code codepoint}
+     */
+    private static AttributesImpl getAttributes(
+            VersionInfo version,
+            AttributeResolver attributeResolver,
+            int codepoint,
+            UCDXMLOUTPUTRANGE outputRange) {
+        AttributesImpl attributes = new AttributesImpl();
+        attributes.addAttribute(
+                NAMESPACE, "cp", "cp", "CDATA", attributeResolver.getHexString(codepoint));
+
+        for (UcdPropertyDetail propDetail : UcdPropertyDetail.ucdxmlValues()) {
+            UcdProperty prop = propDetail.getUcdProperty();
+            // NOTE(review): the upper bound is exclusive here (< 0) but inclusive (<= 0) in
+            // getReservedAttributes - confirm which one is intended.
+            if (version.compareTo(propDetail.getMinVersion()) >= 0
+                    && (propDetail.getMaxVersion() == null
+                            || version.compareTo(propDetail.getMaxVersion()) < 0)) {
+                String attrValue = attributeResolver.getAttributeValue(prop, codepoint);
+                boolean isAttributeIncluded =
+                        getIsAttributeIncluded(
+                                attrValue,
+                                attributeResolver.isUnihanAttributeRange(codepoint),
+                                propDetail,
+                                prop,
+                                outputRange);
+                if (isAttributeIncluded) {
+                    String propName = prop.getShortName();
+                    if (propName.startsWith("cjk")) {
+                        propName = propName.substring(2);
+                    }
+                    attributes.addAttribute(NAMESPACE, propName, propName, "CDATA", attrValue);
+                }
+            }
+        }
+        return attributes;
+    }
+
+    private static AttributesImpl getGroupAttributes(
+            VersionInfo version,
+            AttributeResolver attributeResolver,
+            int lowCodepoint,
+            int highCodepoint,
+            UCDXMLOUTPUTRANGE outputRange) {
+        AttributesImpl attributes = new AttributesImpl();
+
+        for (UcdPropertyDetail propDetail : UcdPropertyDetail.ucdxmlValues()) {
+            UcdProperty prop = propDetail.getUcdProperty();
+            if (version.compareTo(propDetail.getMinVersion()) >= 0
+                    &&
(propDetail.getMaxVersion() == null + || version.compareTo(propDetail.getMaxVersion()) < 0)) { + int totalCount = 0; + Map counters = new LinkedHashMap<>(); + + for (int codepoint = lowCodepoint; codepoint <= highCodepoint; codepoint++) { + if (!attributeResolver.isUnassignedCodepoint(codepoint)) { + String attrValue = attributeResolver.getAttributeValue(prop, codepoint); + int currentCount = + (counters.get(attrValue) == null) ? 0 : counters.get(attrValue); + currentCount++; + totalCount++; + counters.put(attrValue, currentCount); + } + } + int max = Integer.MIN_VALUE; + String bestAttrValue = null; + for (String attrValue : counters.keySet()) { + int thisCount = counters.get(attrValue); + if (thisCount > max) { + max = thisCount; + bestAttrValue = attrValue; + } + } + switch (prop) { + case Decomposition_Mapping: + case Simple_Uppercase_Mapping: + case Simple_Lowercase_Mapping: + case Simple_Titlecase_Mapping: + case Uppercase_Mapping: + case Lowercase_Mapping: + case Titlecase_Mapping: + case Simple_Case_Folding: + case Case_Folding: + if (bestAttrValue != null) { + bestAttrValue = "#"; + } + } + if (max > 0.2 * totalCount && max > 1) { + boolean isAttributeIncluded = + getIsAttributeIncluded( + bestAttrValue, + attributeResolver.isUnihanAttributeRange(lowCodepoint), + propDetail, + prop, + outputRange); + if (isAttributeIncluded) { + String propName = prop.getShortName(); + if (propName.startsWith("cjk")) { + propName = propName.substring(2); + } + attributes.addAttribute( + NAMESPACE, propName, propName, "CDATA", bestAttrValue); + } + } + } + } + return attributes; + } + + private static boolean getIsAttributeIncluded( + String attrValue, + boolean isUnihanAttributeRange, + UcdPropertyDetail propDetail, + UcdProperty prop, + UCDXMLOUTPUTRANGE outputRange) { + if (attrValue == null) { + return false; + } + if (isUnihanAttributeRange) { + if (outputRange == UCDXMLOUTPUTRANGE.UNIHAN) { + if (prop.equals(UcdProperty.Numeric_Type) && !attrValue.equals("None")) { + 
return true; + } + if (prop.equals(UcdProperty.Numeric_Value) && !attrValue.equals("NaN")) { + return true; + } + return propDetail.isCJKAttribute() + && (propDetail.isCJKShowIfEmpty() || !attrValue.isEmpty()); + } + if (outputRange == UCDXMLOUTPUTRANGE.NOUNIHAN && propDetail.isCJKAttribute()) { + return false; + } + if (propDetail.isCJKShowIfEmpty()) { + return true; + } + } + if (propDetail.isBaseAttribute()) { + return true; + } + return !attrValue.isEmpty(); + } + + private static AttributesImpl getReservedAttributes( + VersionInfo version, AttributeResolver attributeResolver, ArrayList range) { + AttributesImpl attributes = new AttributesImpl(); + + if (range.size() == 1) { + attributes.addAttribute( + NAMESPACE, "cp", "cp", "CDATA", attributeResolver.getHexString(range.get(0))); + } else { + attributes.addAttribute( + NAMESPACE, + "first-cp", + "first-cp", + "CDATA", + attributeResolver.getHexString(range.get(0))); + attributes.addAttribute( + NAMESPACE, + "last-cp", + "last-cp", + "CDATA", + attributeResolver.getHexString(range.get(range.size() - 1))); + } + for (UcdPropertyDetail propDetail : UcdPropertyDetail.baseValues()) { + UcdProperty prop = propDetail.getUcdProperty(); + if (version.compareTo(propDetail.getMinVersion()) >= 0 + && (propDetail.getMaxVersion() == null + || version.compareTo(propDetail.getMaxVersion()) <= 0)) { + String attrValue = + attributeResolver.getAttributeValue( + propDetail.getUcdProperty(), range.get(0)); + + attributes.addAttribute( + NAMESPACE, prop.getShortName(), prop.getShortName(), "CDATA", attrValue); + } + } + return attributes; + } + + private static String getVersionString(VersionInfo version, int maxDigits) { + if (maxDigits >= 1 && maxDigits <= 4) { + int[] digits = + new int[] { + version.getMajor(), + version.getMinor(), + version.getMilli(), + version.getMicro() + }; + StringBuilder verStr = new StringBuilder(7); + verStr.append(digits[0]); + for (int i = 1; i < maxDigits; ++i) { + verStr.append("."); + 
verStr.append(digits[i]); + } + return verStr.toString(); + } else { + throw new IllegalArgumentException("Invalid maxDigits range"); + } + } +} diff --git a/unicodetools/src/main/java/org/unicode/xml/XMLProperties.java b/unicodetools/src/main/java/org/unicode/xml/XMLProperties.java new file mode 100644 index 0000000000..396bddeb7f --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/XMLProperties.java @@ -0,0 +1,482 @@ +package org.unicode.xml; + +import com.ibm.icu.dev.util.UnicodeMap; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.*; +import java.util.Map.Entry; +import org.unicode.cldr.util.XMLFileReader; +import org.unicode.props.IndexUnicodeProperties; +import org.unicode.props.UcdProperty; +import org.unicode.text.utility.Utility; +import org.xml.sax.*; + +public class XMLProperties { + + enum XmlLeaf { + // Leaf + BLOCK, + BLOCKS, + CHAR, + CJK_RADICAL, + CJK_RADICALS, + DESCRIPTION, + DO_NOT_EMIT, + EMOJI_SOURCE, + EMOJI_SOURCES, + GROUP, + INSTEAD, + NAME_ALIAS, + NAMED_SEQUENCE, + NAMED_SEQUENCES, + NONCHARACTER, + NORMALIZATION_CORRECTION, + NORMALIZATION_CORRECTIONS, + PROVISIONAL_NAMED_SEQUENCES, + REPERTOIRE, + RESERVED, + STANDARDIZED_VARIANT, + STANDARDIZED_VARIANTS, + SURROGATE, + UCD; + static final XmlLeaf GREATEST_LEAF = NAME_ALIAS; + static final XmlLeaf GREATEST_BOTH = CHAR; + + static XmlLeaf forString(String source) { + try { + return XmlLeaf.valueOf(source.toUpperCase().replace('-', '_')); + } catch (final Exception e) { + return null; + } + } + } + + static class IntRange { + int start; + int end; + } + + Map> property2data = + new EnumMap>(UcdProperty.class); + + { + for (final UcdProperty prop : UcdProperty.values()) { + property2data.put(prop, new UnicodeMap()); + } + } + + Set leavesNotHandled = new LinkedHashSet(); + + public XMLProperties(File ucdxmlFile) { + readFile(ucdxmlFile); + + for (final UcdProperty prop : property2data.keySet()) { + final UnicodeMap map = 
property2data.get(prop); + map.freeze(); + } + } + + public void readFile(File ucdxmlFile) { + try { + System.out.println("Reading: " + ucdxmlFile.toString()); + final FileInputStream fis = new FileInputStream(ucdxmlFile); + final XMLReader xmlReader = XMLFileReader.createXMLReader(false); + xmlReader.setErrorHandler(new MyErrorHandler()); + xmlReader.setContentHandler(new MyContentHandler()); + final InputSource is = new InputSource(fis); + is.setSystemId(ucdxmlFile.toString()); + xmlReader.parse(is); + fis.close(); + } catch (final IOException | SAXException e) { + System.out.println("\t" + "Can't read " + ucdxmlFile); + System.out.println("\t" + e.getClass() + "\t" + e.getMessage()); + } + } + + class MyContentHandler implements ContentHandler { + IntRange cp = new IntRange(); + HashMap attributes = new HashMap(); + HashMap groupAttributes = new HashMap(); + private final List lastElements = new ArrayList(); + + public MyContentHandler() {} + + @Override + public void characters(char[] arg0, int arg1, int arg2) throws SAXException { + final String chars = String.valueOf(arg0, arg1, arg2).trim(); + if (!chars.trim().isEmpty() + && lastElements.get(lastElements.size() - 1) != XmlLeaf.DESCRIPTION) { + throw new IllegalArgumentException("Should have no element content"); + } + } + + @Override + public void endElement(String arg0, String arg1, String arg2) throws SAXException { + try { + if (lastElements.isEmpty()) { + System.out.println( + "endElement: can't remove last element. Args: " + + arg0 + + ", " + + arg1 + + ", " + + arg2); + } else { + final XmlLeaf removed = lastElements.remove(lastElements.size() - 1); + } + } catch (ArrayIndexOutOfBoundsException e) { + throw new IllegalArgumentException( + "endElement: can't remove last element. 
Args: " + + arg0 + + ", " + + arg1 + + ", " + + arg2, + e); + } + } + + @Override + public void endDocument() throws SAXException {} + + @Override + public void endPrefixMapping(String arg0) throws SAXException {} + + @Override + public void ignorableWhitespace(char[] arg0, int arg1, int arg2) throws SAXException {} + + @Override + public void processingInstruction(String arg0, String arg1) throws SAXException {} + + @Override + public void setDocumentLocator(Locator arg0) {} + + @Override + public void skippedEntity(String arg0) throws SAXException {} + + @Override + public void startDocument() throws SAXException {} + + @Override + public void startPrefixMapping(String arg0, String arg1) throws SAXException {} + + @Override + public void startElement( + String namespaceURI, String localName, String qName, Attributes atts) { + try { + final XmlLeaf xmlLeaf = XmlLeaf.forString(qName); + if (xmlLeaf == null) { + throw new IllegalArgumentException(qName); + } + lastElements.add(xmlLeaf); + // System.out.println("Added:\t" + lastElements); + + if (xmlLeaf == XmlLeaf.GROUP) { + groupAttributes.clear(); + addAttributes(atts, groupAttributes); + return; + } + attributes.clear(); + attributes.putAll(groupAttributes); + addAttributes(atts, attributes); + String cps; + switch (xmlLeaf) { + case CHAR: + case RESERVED: + case SURROGATE: + case NONCHARACTER: + parseCp(attributes); + for (final Entry entry : attributes.entrySet()) { + doAttributes(entry.getKey(), entry.getValue()); + } + if (xmlLeaf == XmlLeaf.NONCHARACTER) { + property2data + .get(UcdProperty.Noncharacter_Code_Point) + .putAll(cp.start, cp.end, "Yes"); + } + break; + case BLOCK: + parseCp(attributes); + property2data + .get(UcdProperty.Block) + .putAll(cp.start, cp.end, attributes.get("name")); + break; + case NAMED_SEQUENCE: + cps = Utility.fromHex(attributes.get("cps")); + property2data + .get(UcdProperty.Named_Sequences) + .put(cps, attributes.get("name")); + break; + case CJK_RADICAL: + final String number 
= attributes.get("number"); + setProp( + Utility.fromHex(attributes.get("radical")), + UcdProperty.CJK_Radical, + number); + setProp( + Utility.fromHex(attributes.get("ideograph")), + UcdProperty.CJK_Radical, + number); + break; + case EMOJI_SOURCE: + cps = Utility.fromHex(attributes.get("unicode")); + setProp(cps, UcdProperty.Emoji_DCM, attributes.get("docomo")); + setProp(cps, UcdProperty.Emoji_KDDI, attributes.get("kddi")); + setProp(cps, UcdProperty.Emoji_SB, attributes.get("softbank")); + break; + case REPERTOIRE: + case BLOCKS: + case CJK_RADICALS: + case EMOJI_SOURCES: + case NAMED_SEQUENCES: + case PROVISIONAL_NAMED_SEQUENCES: + case NORMALIZATION_CORRECTIONS: + case STANDARDIZED_VARIANTS: + case DESCRIPTION: + case DO_NOT_EMIT: + // non-informational nodes, skip + if (atts.getLength() != 0) { + throw new IllegalArgumentException("Has attributes"); + } + break; + case UCD: + if (atts.getLength() != 0) { + throw new IllegalArgumentException( + "Has wrong number of attributes: " + attributes.entrySet()); + } + break; + case NAME_ALIAS: + final String alias = + attributes.get("alias") + "(" + attributes.get("type") + ")"; + appendProp(cp.start, UcdProperty.Name_Alias, alias); + break; + case STANDARDIZED_VARIANT: + { + String desc = attributes.get("desc"); + final String when = attributes.get("when"); + if (!when.isEmpty()) { + desc = desc + "(" + when + ")"; + } + cps = Utility.fromHex(attributes.get("cps")); + appendProp(cps, UcdProperty.Standardized_Variant, desc); + break; + } + case NORMALIZATION_CORRECTION: + final String correction = + "old: " + + attributes.get("old") + + " new: " + + attributes.get("new") + + " version: " + + attributes.get("version"); + cps = Utility.fromHex(attributes.get("cp")); + appendProp(cps, UcdProperty.NC_Original, correction); + break; + case INSTEAD: + final String instead = + "use: " + + attributes.get("use") + + " because: " + + attributes.get("because"); + cps = attributes.get("of"); + appendProp(cps, 
UcdProperty.Do_Not_Emit_Preferred, instead); + break; + case GROUP: + break; // handled above. Leaving case for clarity + default: + leavesNotHandled.add(qName); + break; + } + } catch (final Exception e) { + System.out.println( + "Exception: " + + qName + + "\t" + + e.getClass().getName() + + "\t" + + e.getMessage()); + } + } + + public void addAttributes(Attributes atts, Map map) { + for (int i = 0; i < atts.getLength(); ++i) { + map.put(atts.getQName(i), atts.getValue(i)); + } + } + + public void setProp(String cps, UcdProperty ucdProperty, String docomo) { + if (docomo != null) { + property2data.get(ucdProperty).put(cps, docomo); + } + } + + public void setProp(int cps, UcdProperty ucdProperty, String docomo) { + if (docomo != null) { + property2data.get(ucdProperty).put(cps, docomo); + } + } + + public void appendProp(int cps, UcdProperty ucdProperty, String docomo) { + final UnicodeMap unicodeMap = property2data.get(ucdProperty); + final String former = unicodeMap.get(cps); + unicodeMap.put(cps, former == null ? docomo : former + "; " + docomo); + } + + public void appendProp(String cps, UcdProperty ucdProperty, String docomo) { + final UnicodeMap unicodeMap = property2data.get(ucdProperty); + final String former = unicodeMap.get(cps); + unicodeMap.put(cps, former == null ? 
docomo : former + "; " + docomo); + } + + public void parseCp(HashMap attributes2) { + final String cpString = attributes2.get("cp"); + if (cpString != null) { + cp.start = cp.end = Integer.parseInt(cpString, 16); + } else { + cp.start = Integer.parseInt(attributes2.get("first-cp"), 16); + cp.end = Integer.parseInt(attributes2.get("last-cp"), 16); + } + } + + public UnicodeMap doAttributes(String key, String value) { + UcdProperty prop = UcdProperty.forString(key); + // if (prop == UcdProperty.Deprecated && cp.start > 0xE0000 && cp.start < + // 0xE00FF) { + // System.out.println(Utility.hex(cp.start) + "," + Utility.hex(cp.end) + + // "\t" + key + "\t" + value); + // } + if (prop == null) { + if (key.endsWith("cp")) { + if (key.equals("cp") || key.equals("last-cp") || key.equals("first-cp")) { + return null; + } + } else if (key.equals("InSC")) { + prop = UcdProperty.Indic_Syllabic_Category; + } else if (key.equals("InMC")) { + prop = UcdProperty.Indic_Positional_Category; // InMC = legacy Indic_Matra_Category, since renamed Indic_Positional_Category; must not overwrite InSC data + } + if (prop == null) { + return null; + } + } + final UnicodeMap data = property2data.get(prop); + if (data == null) { + System.out.println("can't get data for " + key); + return null; + } + data.putAll(cp.start, cp.end, value.intern()); + return data; + } + } + + static class MyErrorHandler implements ErrorHandler { + @Override + public void error(SAXParseException exception) throws SAXException { + // System.out.println("\nerror: " + XMLFileReader.showSAX(exception)); + throw exception; + } + + @Override + public void fatalError(SAXParseException exception) throws SAXException { + // System.out.println("\nfatalError: " + XMLFileReader.showSAX(exception)); + throw exception; + } + + @Override + public void warning(SAXParseException exception) throws SAXException { + // System.out.println("\nwarning: " + XMLFileReader.showSAX(exception)); + throw exception; + } + } + + public UnicodeMap getMap(UcdProperty prop) { + return property2data.get(prop); + } + + public Set getLeavesNotHandled() { + 
return leavesNotHandled; + } + + static String show(String ival) { + if (ival == null) { + return "null"; + } else if (ival.isEmpty()) { + return ""; + } else if (ival.codePointAt(0) < 0x20) { + return "\\u{" + Utility.hex(ival, 4) + "}"; + } + return "«" + ival + "»"; + } + + // private static final String NO_VALUE = + // IndexUnicodeProperties.DefaultValueType.NO_VALUE.toString(); + // private static final String NAN = IndexUnicodeProperties.DefaultValueType.NaN.toString(); + + static final boolean HACK_XML_DEFAULTS = false; + + public static String getXmlResolved(UcdProperty property, int codePoint, String propertyValue) { + if (property == UcdProperty.Name) { + int debug = 0; + } + switch (property.getType()) { + case Binary: + if (HACK_XML_DEFAULTS) { + if (propertyValue == null) { + propertyValue = "No"; + } else { + propertyValue = + IndexUnicodeProperties.normalizeValue(property, propertyValue); + } + break; + } + // $FALL-THROUGH$ + case Enumerated: + case Catalog: + if (propertyValue != null) { + propertyValue = IndexUnicodeProperties.normalizeValue(property, propertyValue); + } + break; + case Numeric: + // if (HACK_XML_DEFAULTS) { + // if (propertyValue == null || propertyValue.isEmpty()) { + // propertyValue = "NaN"; + // } + // } + switch (property) { + case kOtherNumeric: + case kPrimaryNumeric: + case kAccountingNumeric: + if (propertyValue == null || propertyValue.isEmpty()) { + propertyValue = "NaN"; + } + break; + } + break; + case Miscellaneous: + if (propertyValue != null) { + switch (property) { + case Script_Extensions: + propertyValue = + IndexUnicodeProperties.normalizeValue(property, propertyValue); + break; + // case Name: + // break; + default: + propertyValue = propertyValue.replace("#", Utility.hex(codePoint)); + } + } + break; + case String: + if (propertyValue != null) { + propertyValue = propertyValue.replace("#", Utility.hex(codePoint)); + propertyValue = Utility.fromHex(propertyValue); + } + break; + default: + break; + } + return 
propertyValue; + // return propertyValue == null ? "" : propertyValue; + } +} diff --git a/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt b/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt index 80faee3c73..b18bdc4f27 100644 --- a/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt +++ b/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt @@ -66,6 +66,7 @@ CJKR ; CJK_Radical EDCM ; Emoji_DCM EKDDI ; Emoji_KDDI ESB ; Emoji_SB +EVS ; emoji_variation_sequence NS ; Named_Sequences NSP ; Named_Sequences_Prov SV ; Standardized_Variant @@ -149,6 +150,9 @@ cjkJoyoKanji ; kJoyoKanji cjkKoreanEducationHanja ; kKoreanEducationHanja cjkKoreanName ; kKoreanName cjkTGH ; kTGH +ncCorrected ; NC_Corrected +ncOriginal ; NC_Original +ncVersion ; NC_Version # 13.0 cjkSpoofingVariant ; kSpoofingVariant cjkTGHZ2013 ; kTGHZ2013 @@ -166,7 +170,7 @@ cjkVietnameseNumeric ; kVietnameseNumeric cjkZhuangNumeric ; kZhuangNumeric # 16.0 cjkFanqie ; kFanqie - +cjkZhuang ; kZhuang kTGT_MergedSrc ; kTGT_MergedSrc kRSTUnicode ; kRSTUnicode @@ -175,4 +179,4 @@ kReading ; kReading kEH_Func ; kEH_Func kEH_FVal ; kEH_FVal -kEH_UniK ; kEH_UniK \ No newline at end of file +kEH_UniK ; kEH_UniK diff --git a/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt b/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt index 8d659c98f6..e82f33b4fd 100644 --- a/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt +++ b/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt @@ -85,7 +85,6 @@ # @missing: 0000..10FFFF; Emoji_Component ; No # @missing: 0000..10FFFF; Extended_Pictographic ; No -# @missing: 0000..10FFFF; kEH_Core ; No # @missing: 0000..10FFFF; kEH_NoMirror ; No # @missing: 0000..10FFFF; kEH_NoRotate ; No @@ -205,6 +204,11 @@ Do_Not_Emit_Type ; Preferred_Spelling ; Preferred_Spelling # @missing: 
0000..10FFFF; kSrc_NushuDuben ; # @missing: 0000..10FFFF; kReading ; +# @missing: 0000..10FFFF; kEH_Core ; N +kEH_Core ; C ; Core +kEH_Core ; L ; Legacy +kEH_Core ; N ; None + # @missing: 0000..10FFFF; kEH_Func ; # @missing: 0000..10FFFF; kEH_FVal ; # @missing: 0000..10FFFF; kEH_UniK ; diff --git a/unicodetools/src/main/resources/org/unicode/props/IndexPropertyRegex.txt b/unicodetools/src/main/resources/org/unicode/props/IndexPropertyRegex.txt index 5c8153d33b..70c52767ff 100644 --- a/unicodetools/src/main/resources/org/unicode/props/IndexPropertyRegex.txt +++ b/unicodetools/src/main/resources/org/unicode/props/IndexPropertyRegex.txt @@ -44,9 +44,11 @@ $codePoint0 = ($codePoints)? # Main data Bidi_Mirroring_Glyph ; SINGLE_VALUED ; $codePoint +Bidi_Paired_Bracket ; SINGLE_VALUED ; $codePoint Simple_Lowercase_Mapping ; SINGLE_VALUED ; $codePoint Simple_Titlecase_Mapping ; SINGLE_VALUED ; $codePoint Simple_Uppercase_Mapping ; SINGLE_VALUED ; $codePoint +Equivalent_Unified_Ideograph; SINGLE_VALUED ; $codePoint NFKC_Casefold ; SINGLE_VALUED ; $codePoint0 NFKC_Simple_Casefold ; SINGLE_VALUED ; $codePoint0 @@ -142,7 +144,7 @@ kHanYu ; MULTI_VALUED ; [1-8][0-9]{4}\.[0-3 kIRGHanyuDaZidian ; SINGLE_VALUED ; [1-8][0-9]{4}\.[0-3][0-9][01] kCNS1992 ; SINGLE_VALUED ; [1-9]-[0-9A-F]{4} kTotalStrokes ; ORDERED ; [1-9][0-9]{0,2} -kRSUnicode ; ORDERED ; [1-9][0-9]{0,2}\'?\.[0-9]{1,2} +kRSUnicode ; ORDERED ; [1-9][0-9]{0,2}\'?\.[0-9]{1,2} kRSJapanese ; EXTENSIBLE ; [1-9][0-9]{0,2}\.[0-9]{1,2} kRSKanWa ; EXTENSIBLE ; [1-9][0-9]{0,2}\.[0-9]{1,2} kRSKangXi ; EXTENSIBLE ; [1-9][0-9]{0,2}\.[0-9]{1,2} @@ -170,39 +172,48 @@ kHanyuPinlu ; MULTI_VALUED ; [a-z\x{308}]+[1-5]\ kCantonese ; MULTI_VALUED ; [a-z]{1,6}[1-6] kTang ; MULTI_VALUED ; \*?[A-Za-z()\x{E6}\x{251}\x{259}\x{25B}\x{300}\x{30C}]+ -kJinmeiyoKanji ; MULTI_VALUED ; (20[0-9]{2})(:U\+2?[0-9A-F]{4})? -kJoyoKanji ; MULTI_VALUED ; (20[0-9]{2})|(U\+2?[0-9A-F]{4}) +kJinmeiyoKanji ; MULTI_VALUED ; (20[0-9]{2})(:U\+2?[0-9A-F]{4})? 
+kJoyoKanji ; MULTI_VALUED ; (20[0-9]{2})|(U\+2?[0-9A-F]{4}) kKoreanEducationHanja ; MULTI_VALUED ; 20[0-9]{2} -kKoreanName ; MULTI_VALUED ; (20[0-9]{2})(:U\+2?[0-9A-F]{4})* -kTGH ; MULTI_VALUED ; 20[0-9]{2}:[1-9][0-9]{0,3} +kKoreanName ; MULTI_VALUED ; (20[0-9]{2})(:U\+2?[0-9A-F]{4})* +kTGH ; MULTI_VALUED ; 20[0-9]{2}:[1-9][0-9]{0,3} -kIRG_UKSource ; SINGLE_VALUED ; V[0-4]-[0-9A-F]{4} +kIRG_UKSource ; SINGLE_VALUED ; V[0-4]-[0-9A-F]{4} kIRG_SSource ; SINGLE_VALUED ; V[0-4]-[0-9A-F]{4} +kSrc_NushuDuben ; SINGLE_VALUED ; [0-9]+\.[0-9]+ +kReading ; SINGLE_VALUED ; [a-z]{1,6}[1-6]+ +kRSTUnicode ; SINGLE_VALUED ; [0-9]+\.[0-9]+ +kTGT_MergedSrc ; SINGLE_VALUED ; L2008-[0-9A-F]{4,5}(-[0-9]{4,5})? + +NC_Original ; SINGLE_VALUED ; [0-9A-F]{4,5} +NC_Corrected ; SINGLE_VALUED ; [0-9A-F]{4,5} +NC_Version ; SINGLE_VALUED ; [0-9]\.[0-9]\.[0-9] + # ============================= # Catalog/Enum/Binary Properties # All not listed are SINGLE_VALUED ; null # ============================= -Script_Extensions ; MULTI_VALUED ; -Standardized_Variant ; MULTI_VALUED ; .* +Script_Extensions ; MULTI_VALUED ; +Standardized_Variant ; MULTI_VALUED ; .* -Idn_Status ; SINGLE_VALUED ; -Idn_Mapping ; SINGLE_VALUED ; $codePoints -Idn_2008 ; SINGLE_VALUED ; +Idn_Status ; SINGLE_VALUED ; +Idn_Mapping ; SINGLE_VALUED ; $codePoints +Idn_2008 ; SINGLE_VALUED ; -Identifier_Status ; SINGLE_VALUED ; -Identifier_Type ; MULTI_VALUED ; +Identifier_Status ; SINGLE_VALUED ; +Identifier_Type ; MULTI_VALUED ; -Confusable_SL ; SINGLE_VALUED ; $codePoints -Confusable_SA ; SINGLE_VALUED ; $codePoints -Confusable_ML ; SINGLE_VALUED ; $codePoints -Confusable_MA ; SINGLE_VALUED ; $codePoints +Confusable_SL ; SINGLE_VALUED ; $codePoints +Confusable_SA ; SINGLE_VALUED ; $codePoints +Confusable_ML ; SINGLE_VALUED ; $codePoints +Confusable_MA ; SINGLE_VALUED ; $codePoints -#Emoji ; SINGLE_VALUED ; -#Emoji_Presentation ; SINGLE_VALUED ; -#Emoji_Modifier ; SINGLE_VALUED ; -#Emoji_Modifier_Base ; SINGLE_VALUED ; +#Emoji ; 
SINGLE_VALUED ; +#Emoji_Presentation ; SINGLE_VALUED ; +#Emoji_Modifier ; SINGLE_VALUED ; +#Emoji_Modifier_Base ; SINGLE_VALUED ; diff --git a/unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt b/unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt index 018f9614dd..cc04636b1a 100644 --- a/unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt +++ b/unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt @@ -36,6 +36,8 @@ FileType ; Unihan_OtherMappings ; PropertyValue FileType ; Unihan_RadicalStrokeCounts ; PropertyValue FileType ; Unihan_Readings ; PropertyValue FileType ; Unihan_Variants ; PropertyValue +FileType ; NushuSources ; PropertyValue +FileType ; TangutSources ; PropertyValue # NameAliases File Type # Contains a multivalued property, where successive values are not in the same line, but are divided out on successive lines with the same code point @@ -43,6 +45,7 @@ FileType ; Unihan_Variants ; PropertyValue FileType ; NameAliases ; NameAliases FileType ; NameAliasesProv ; NameAliases FileType ; StandardizedVariants ; StandardizedVariants +FileType ; emoji-variation-sequences ; StandardizedVariants # CJKRadicals File Type @@ -104,6 +107,7 @@ DerivedAge ; Age EastAsianWidth ; East_Asian_Width HangulSyllableType ; Hangul_Syllable_Type IndicPositionalCategory ; Indic_Positional_Category +IndicMatraCategory ; Indic_Positional_Category ; v7.0 IndicSyllabicCategory ; Indic_Syllabic_Category Jamo ; Jamo_Short_Name LineBreak ; Line_Break @@ -309,6 +313,15 @@ Unihan_Variants ; kSpoofingVariant Unihan_Variants ; kTraditionalVariant Unihan_Variants ; kZVariant +NushuSources ; kSrc_NushuDuben +NushuSources ; kReading +TangutSources ; kRSTUnicode +TangutSources ; kTGT_MergedSrc + +NormalizationCorrections ; NC_Original +NormalizationCorrections ; NC_Corrected +NormalizationCorrections ; NC_Version + # Extras ScriptExtensions ; Script_Extensions @@ -319,6 +332,7 @@ EmojiSources 
; Emoji_SB ; 3 NamedSequences ; Named_Sequences NamedSequencesProv ; Named_Sequences_Prov StandardizedVariants ; Standardized_Variant +emoji-variation-sequences ; emoji-variation-sequence DoNotEmit ; Do_Not_Emit_Preferred ; 1 DoNotEmit ; Do_Not_Emit_Type ; 2 @@ -369,15 +383,6 @@ emoji/*/emoji-zwj-sequences; RGI_Emoji_Zwj_Sequence #emoji/*/emoji-test ; Emoji_Short_Name - -FileType ; TangutSources ; PropertyValue -TangutSources ; kTGT_MergedSrc -TangutSources ; kRSTUnicode - -FileType ; NushuSources ; PropertyValue -NushuSources ; kSrc_NushuDuben -NushuSources ; kReading - FileType ; Unikemet ; PropertyValue Unikemet ; kEH_Cat Unikemet ; kEH_Core diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/Bidi_C.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Bidi_C.xml new file mode 100644 index 0000000000..617113bf28 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Bidi_C.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute Bidi_C { boolean }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/Bidi_M.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Bidi_M.xml new file mode 100644 index 0000000000..c1380221b2 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Bidi_M.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute Bidi_M { boolean }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/Emoji.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Emoji.xml new file mode 100644 index 0000000000..7c78734594 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Emoji.xml @@ -0,0 +1,20 @@ + + + code-point-attributes &= + attribute Emoji { boolean }? + + code-point-attributes &= + attribute EPres { boolean }? + + code-point-attributes &= + attribute EMod { boolean }? 
+ + code-point-attributes &= + attribute EBase { boolean }? + + code-point-attributes &= + attribute EComp { boolean }? + + code-point-attributes &= + attribute ExtPict { boolean }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/InCB.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/InCB.xml new file mode 100644 index 0000000000..8340250dc3 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/InCB.xml @@ -0,0 +1,9 @@ + + + code-point-attributes &= + attribute InCB { "Consonant" + | "Extend" + | "Linker" + | "None" + }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/InPC.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/InPC.xml new file mode 100644 index 0000000000..a7de623873 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/InPC.xml @@ -0,0 +1,21 @@ + + + code-point-attributes &= + attribute InPC { "Bottom" + | "Bottom_And_Left" + | "Bottom_And_Right" + | "Left" + | "Left_And_Right" + | "NA" + | "Overstruck" + | "Right" + | "Top" + | "Top_And_Bottom" + | "Top_And_Bottom_And_Left" + | "Top_And_Bottom_And_Right" + | "Top_And_Left" + | "Top_And_Left_And_Right" + | "Top_And_Right" + | "Visual_Order_Left" + }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/InSC.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/InSC.xml new file mode 100644 index 0000000000..ddddc27a4e --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/InSC.xml @@ -0,0 +1,42 @@ + + + code-point-attributes &= + attribute InSC { "Avagraha" + | "Bindu" + | "Brahmi_Joining_Number" + | "Cantillation_Mark" + | "Consonant" + | "Consonant_Dead" + | "Consonant_Final" + | "Consonant_Head_Letter" + | "Consonant_Initial_Postfixed" + | "Consonant_Killer" + | "Consonant_Medial" + | "Consonant_Placeholder" + | "Consonant_Preceding_Repha" + | "Consonant_Prefixed" + | "Consonant_Subjoined" + | "Consonant_Succeeding_Repha" + | "Consonant_With_Stacker" + | "Gemination_Mark" + | "Invisible_Stacker" + | "Joiner" + | "Modifying_Letter" + | "Non_Joiner" + | "Nukta" + | "Number" + | "Number_Joiner" + | "Other" + | "Pure_Killer" + | "Register_Shifter" + | "Reordering_Killer" + | "Syllable_Modifier" + | "Tone_Letter" + | "Tone_Mark" + | "Virama" + | "Visarga" + | "Vowel" + | "Vowel_Dependent" + | "Vowel_Independent" + }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/JSN.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/JSN.xml new file mode 100644 index 0000000000..568f5e270c --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/JSN.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute JSN { xsd:string { pattern="[A-Z]{0,3}" } }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/Join_C.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Join_C.xml new file mode 100644 index 0000000000..4cbf1d0f0f --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Join_C.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute Join_C { boolean }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/Name_Alias.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Name_Alias.xml new file mode 100644 index 0000000000..c2b53b2fef --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Name_Alias.xml @@ -0,0 +1,10 @@ + + + code-point-attributes &= + element name-alias { + attribute alias { xsd:string { pattern="[a-zA-Z0-9]+(( -|- |[\-_ ])[a-zA-Z0-9]+)*" } }?, + attribute type { "abbreviation" | "alternate" + | "control" | "correction" + | "figment" + }? } * + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/Nushu.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Nushu.xml new file mode 100644 index 0000000000..8919bba32e --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Nushu.xml @@ -0,0 +1,8 @@ + + + code-point-attributes &= + attribute kSrc_NushuDuben { xsd:string { pattern="[0-9]+\.[0-9]+" } }? + + code-point-attributes &= + attribute kReading { xsd:string }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/Set_of_code_points.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Set_of_code_points.xml new file mode 100644 index 0000000000..a6ff2d0926 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Set_of_code_points.xml @@ -0,0 +1,8 @@ + + + + set-of-code-points = + attribute cp { single-code-point } + | ( attribute first-cp { single-code-point }, + attribute last-cp { single-code-point } ) + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/Tangut.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Tangut.xml new file mode 100644 index 0000000000..21e52208a5 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Tangut.xml @@ -0,0 +1,18 @@ + + + code-point-attributes &= + attribute kRSTUnicode { xsd:string { pattern="[0-9]+\.[0-9]+" } }? + + code-point-attributes &= + attribute kTGT_MergedSrc + { xsd:string {pattern="L2008-[0-9A-F]{4,5}(-[0-9]{4,5})?"} + | xsd:string {pattern="L2006-[0-9]{4}"} + | xsd:string {pattern="L1997-[0-9]{4}"} + | xsd:string {pattern="L1986-[0-9]{4}"} + | xsd:string {pattern="S1968-[0-9]{4}"} + | xsd:string {pattern="N1966-[0-9]{3}(-[0-9A-Z]{3,4})?"} + | xsd:string {pattern="H2004-[A-Z]-[0-9]{4}"} + | xsd:string {pattern="L2012-[0-9]{4}"} + | xsd:string {pattern="UTN42-[0-9]{3}"} + }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/Unihan.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Unihan.xml new file mode 100644 index 0000000000..ba4c042f8d --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Unihan.xml @@ -0,0 +1,347 @@ + + + code-point-attributes &= attribute kAccountingNumeric + { xsd:string { pattern="[0-9]+" } }? 
+ + code-point-attributes &= attribute kAlternateTotalStrokes + { list { xsd:string { pattern="(\d+:[BHJKMPSUV]+)|-" }+ } }? + + code-point-attributes &= attribute kBigFive + { xsd:string { pattern="[0-9A-F]{4}'?" } }? + + code-point-attributes &= attribute kCangjie + { xsd:string { pattern="[A-Z]+" } }? + + code-point-attributes &= attribute kCantonese + { list { xsd:string { pattern="[a-z]{1,6}[1-6]" }+ } }? + + code-point-attributes &= attribute kCCCII + { list { xsd:string { pattern="[0-9A-F]{6}" }+ } }? + + code-point-attributes &= attribute kCheungBauer + { list { xsd:string { pattern="[0-9]{3}/[0-9]{2};[A-Z]*;[a-z1-6\[\]/,]+" }+ } }? + + code-point-attributes &= attribute kCheungBauerIndex + { list { xsd:string { pattern="[0-9]{3}\.[01][0-9]" }+ } }? + + code-point-attributes &= attribute kCihaiT + { list { xsd:string { pattern="[1-9][0-9]{0,3}\.[0-9]{3}" }+ } }? + + code-point-attributes &= attribute kCNS1986 + { xsd:string { pattern="[12E]-[0-9A-F]{4}" } }? + + code-point-attributes &= attribute kCNS1992 + { xsd:string { pattern="[1-9]-[0-9A-F]{4}" } }? + + code-point-attributes &= attribute kCompatibilityVariant + { "" | xsd:string { pattern="U\+[23]?[0-9A-F]{4}" } }? + + code-point-attributes &= attribute kCowles + { list { xsd:string { pattern="[0-9]{1,4}(\.[0-9]{1,2})?" }+ } }? + + code-point-attributes &= attribute kDaeJaweon + { xsd:string { pattern="[0-9]{4}\.[0-9]{2}[01]" } }? + + code-point-attributes &= attribute kDefinition + { xsd:string { pattern='[^\t"]+' } }? + + code-point-attributes &= attribute kEACC + { xsd:string { pattern="[0-9A-F]{6}" } }? + + code-point-attributes &= attribute kFanqie + { list { xsd:string { pattern="[\x{3400}-\x{4DBF}\x{4E00}-\x{9FFF}\x{20000}-\x{2A6DF}]{2}" }+ } }? + + code-point-attributes &= attribute kFenn + { list { xsd:string { pattern="[0-9]+a?[A-KP*]" }+ } }? + + code-point-attributes &= attribute kFennIndex + { list { xsd:string { pattern="[0-9][0-9]{0,2}\.[01][0-9]" }+ } }? 
+ + code-point-attributes &= attribute kFourCornerCode + { list { xsd:string { pattern="[0-9]{4}(\.[0-9])?" }+ } }? + + code-point-attributes &= attribute kGB0 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB1 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB3 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB5 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB7 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB8 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGradeLevel + { xsd:string { pattern="[1-6]" } }? + + code-point-attributes &= attribute kGSR + { list { xsd:string { pattern="[0-9]{4}[a-vx-z]'?" }+ } }? + + code-point-attributes &= attribute kHangul + { list { xsd:string { pattern="[\x{1100}-\x{1112}][\x{1161}-\x{1175}][\x{11A8}-\x{11C2}]?:[01ENX]{1,3}" }+ } }? + + code-point-attributes &= attribute kHanYu + { list { xsd:string { pattern="[1-8][0-9]{4}\.[0-3][0-9][0-3]" }+ } }? + + code-point-attributes &= attribute kHanyuPinlu + { list { xsd:string { pattern="[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+\([0-9]+\)" }+ } }? + + code-point-attributes &= attribute kHanyuPinyin + { list { xsd:string { pattern="(\d{5}\.\d{2}0,)*\d{5}\.\d{2}0:([a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+,)*[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kHDZRadBreak + { xsd:string { pattern="[\x{2F00}-\x{2FD5}]\[U\+2F[0-9A-D][0-9A-F]\]:[1-8][0-9]{4}\.[0-3][0-9]0" } }? + + code-point-attributes &= attribute kHKGlyph + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kIBMJapan + { list { xsd:string { pattern="F[ABC][0-9A-F]{2}" }+ } }? + + code-point-attributes &= attribute kIICore + { list { xsd:string { pattern="[ABC][GHJKMPT]{1,7}" }+ } }? 
+ + code-point-attributes &= attribute kIRG_GSource + { "" | xsd:string { pattern="G[013578EKS]-[0-9A-F]{4}" } + | xsd:string { pattern="G4K(-\d{5})?" } + | xsd:string { pattern="G(DZ|GH|RM|WZ|XC|XH|ZH)-\d{4}\.\d{2}" } + | xsd:string { pattern="G(BK|CH|CY|HC)(-\d{4}\.\d{2})?" } + | xsd:string { pattern="GKX-\d{4}\.\d{2,3}" } + | xsd:string { pattern="G(HZ|HZR)-\d{5}\.\d{2}" } + | xsd:string { pattern="G(CE|FC|IDC23|OCD|XHZ)-\d{3}" } + | xsd:string { pattern="G(H|HF|LGYJ|PGLG|T)-\d{4}" } + | xsd:string { pattern="G(CYY|DM|JZ|KJ|XM|ZFY|ZJW|ZYS)-\d{5}" } + | xsd:string { pattern="G(FZ|IDC)-[0-9A-F]{4}" } + | xsd:string { pattern="GGFZ-\d{6}" } + | xsd:string { pattern="G(LK|Z)-\d{7}" } + | xsd:string { pattern="GU-[023][0-9A-F]{4}" } + | xsd:string { pattern="GZA-[123467]\d{5}" } + }? + + code-point-attributes &= attribute kIRG_HSource + { "" | xsd:string { pattern="H-[0-9A-F]{4}" } + | xsd:string { pattern="H(B[012])-[0-9A-F]{4}" } + | xsd:string { pattern="HD-[23]?[0-9A-F]{4}" } + | xsd:string { pattern="HU-[023][0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRG_JSource + { "" | xsd:string { pattern="J[014]-[0-9A-F]{4}" } + | xsd:string { pattern="J3A?-[0-9A-F]{4}" } + | xsd:string { pattern="J13A?-[0-9A-F]{4}" } + | xsd:string { pattern="J14-[0-9A-F]{4}" } + | xsd:string { pattern="JA[34]?-[0-9A-F]{4}" } + | xsd:string { pattern="JARIB-[0-9A-F]{4}" } + | xsd:string { pattern="JH-(JT[ABC][0-9A-F]{3}S?|IB\d{4}|\d{6})" } + | xsd:string { pattern="JK-\d{5}" } + | xsd:string { pattern="JMJ-\d{6}" } + }? + + code-point-attributes &= attribute kIRG_KPSource + { "" | xsd:string { pattern="KP([01]-[0-9A-F]{4}|U-[023][0-9A-F]{4})" } }? + + code-point-attributes &= attribute kIRG_KSource + { "" | xsd:string { pattern="K[0-6]-[0-9A-F]{4}" } + | xsd:string { pattern="KC-\d{5}" } + | xsd:string { pattern="KU-[023][0-9A-F]{4}" } + }? 
+ + code-point-attributes &= attribute kIRG_MSource + { "" | xsd:string { pattern="MA-[0-9A-F]{4}" } + | xsd:string { pattern="MB[12]-[0-9A-F]{4}" } + | xsd:string { pattern="MC-\d{5}" } + | xsd:string { pattern="MDH?-[23]?[0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRG_SSource + { "" | xsd:string { pattern="SAT-\d{5}" } }? + + code-point-attributes &= attribute kIRG_TSource + { "" | xsd:string { pattern="T([1-7A-F]|1[1-3])-[0-9A-F]{4}" } + | xsd:string { pattern="TU-[023][0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRG_UKSource + { "" | xsd:string { pattern="UK-\d{5}" } }? + + code-point-attributes &= attribute kIRG_USource + { "" | xsd:string { pattern="UTC-\d{5}" } }? + + code-point-attributes &= attribute kIRG_VSource + { "" | xsd:string { pattern="V[0-4]-[0-9A-F]{4}" } + | xsd:string { pattern="VN-[023F][0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRGDaeJaweon + { list { xsd:string { pattern="[0-9]{4}\.[0-9]{2}[01]" }+ } }? + + code-point-attributes &= attribute kIRGHanyuDaZidian + { list { xsd:string { pattern="[1-8][0-9]{4}\.[0-3][0-9][01]" }+ } }? + + code-point-attributes &= attribute kIRGKangXi + { list { xsd:string { pattern="[01][0-9]{3}\.[0-7][0-9][01]" }+ } }? + + code-point-attributes &= attribute kJa + { list { xsd:string { pattern="[0-9A-F]{4}S?" }+ } }? + + code-point-attributes &= attribute kJapanese + { list { xsd:string { pattern="[\x{3041}-\x{3096}\x{3099}\x{309A}\x{30A1}-\x{30FA}\x{30FC}]+" }+ } }? + + code-point-attributes &= attribute kJapaneseKun + { list { xsd:string { pattern="[A-Z]+" }+ } }? + + code-point-attributes &= attribute kJapaneseOn + { list { xsd:string { pattern="[A-Z]+" }+ } }? + + code-point-attributes &= attribute kJinmeiyoKanji + { list { xsd:string { pattern="(20[0-9]{2})(:U\+[23]?[0-9A-F]{4})?" }+ } }? + + code-point-attributes &= attribute kJis0 + { list { xsd:string { pattern="[0-9]{4}" }+ } }? 
+ + code-point-attributes &= attribute kJis1 + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kJIS0213 + { list { xsd:string { pattern="[12],[0-9]{2},[0-9]{1,2}" }+ } }? + + code-point-attributes &= attribute kJoyoKanji + { list { xsd:string { pattern="(20[0-9]{2})|(U\+[23]?[0-9A-F]{4})" }+ } }? + + code-point-attributes &= attribute kKangXi + { list { xsd:string { pattern="[0-9]{4}\.[0-9]{2}[01]" }+ } }? + + code-point-attributes &= attribute kKarlgren + { list { xsd:string { pattern="[1-9][0-9]{0,3}[A*]?" }+ } }? + + code-point-attributes &= attribute kKorean + { list { xsd:string { pattern="[A-Z]+" }+ } }? + + code-point-attributes &= attribute kKoreanEducationHanja + { list { xsd:string { pattern="20[0-9]{2}" }+ } }? + + code-point-attributes &= attribute kKoreanName + { list { xsd:string { pattern="20[0-9]{2}" }+ } }? + + code-point-attributes &= attribute kLau + { list { xsd:string { pattern="[1-9][0-9]{0,3}" }+ } }? + + code-point-attributes &= attribute kMainlandTelegraph + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kMandarin + { list { xsd:string { pattern="[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kMatthews + { list { xsd:string { pattern="[1-9][0-9]{0,3}(a|\.5)?" }+ } }? + + code-point-attributes &= attribute kMeyerWempe + { list { xsd:string { pattern="[1-9][0-9]{0,3}[a-t*]?" }+ } }? + + code-point-attributes &= attribute kMojiJoho + { list { xsd:string { pattern="MJ\d{6}(:(FE0[01]|E01[01][0-9A-F]))?" }+ } }? + + code-point-attributes &= attribute kMorohashi + { list { xsd:string { pattern="(\d{5}'{0,2}|H\d{3})(:(FE0[01]|E010[0-9A-F]))?" }+ } }? + + code-point-attributes &= attribute kNelson + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kOtherNumeric + { list { xsd:string { pattern="[0-9]+" }+ } }? 
+ + code-point-attributes &= attribute kPhonetic + { list { xsd:string { pattern="[1-9][0-9]{0,3}[A-D]?\*?" }+ } }? + + code-point-attributes &= attribute kPrimaryNumeric + { list { xsd:string { pattern="[0-9]+" }+ } }? + + code-point-attributes &= attribute kPseudoGB1 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kRSAdobe_Japan1_6 + { list { xsd:string { pattern="[CV]\+[0-9]{1,5}\+[1-9][0-9]{0,2}\.[1-9][0-9]?\.[0-9]{1,2}" }+ } }? + + code-point-attributes &= attribute kRSUnicode + { list { xsd:string { pattern="[1-9][0-9]{0,2}'{0,3}\.-?[0-9]{1,2}" }+ } }? + + code-point-attributes &= attribute kSBGY + { list { xsd:string { pattern="[0-9]{3}\.[0-7][0-9]" }+ } }? + + code-point-attributes &= attribute kSemanticVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}(<[ks][A-Za-z0-9_]+(:[TBZFJ]+)?(,[ks][A-Za-z0-9_]+(:[TBZFJ]+)?)*)?" }+ } }? + + code-point-attributes &= attribute kSimplifiedVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}" }+ } }? + + code-point-attributes &= attribute kSMSZD2003Index + { list { xsd:string { pattern="\d{1,3}\.\d{2}" }+ } }? + + code-point-attributes &= attribute kSMSZD2003Readings + { list { xsd:string { pattern="[a-z\x{300}\x{301}\x{302}\x{304}\x{308}\x{30C}]+(,[a-z\x{300}\x{301}\x{302}\x{304}\x{308}\x{30C}]+)*\x{7CB5}[a-z]+[1-6]([a-z]+[1-6])?(,[a-z]+[1-6]([a-z]+[1-6])?)*" }+ } }? + + code-point-attributes &= attribute kSpecializedSemanticVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}(<[ks][A-Za-z0-9_]+(:[TBZFJ]+)?(,[ks][A-Za-z0-9_]+(:[TBZFJ]+)?)*)?" }+ } }? + + code-point-attributes &= attribute kSpoofingVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}" }+ } }? + + code-point-attributes &= attribute kStrange + { list { ( xsd:string { pattern="[ACU]" } + | xsd:string { pattern="B:U\+31[0-2AB][0-9A-F]" } + | xsd:string { pattern="[FMOR](:U\+[23]?[0-9A-F]{4})?" 
} + | xsd:string { pattern="H:U\+31[3-8][0-9A-F]" } + | xsd:string { pattern="I(:U\+[23]?[0-9A-F]{4})*" } + | xsd:string { pattern="K(:U\+30[A-F][0-9A-F])+" } + | xsd:string { pattern="S:[4-9][0-9]" } + )+}}? + + code-point-attributes &= attribute kTaiwanTelegraph + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kTang + { list { xsd:string { pattern="\*?[A-Za-z()\x{E6}\x{251}\x{259}\x{25B}\x{300}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kTGH + { list { xsd:string { pattern="20[0-9]{2}:[1-9][0-9]{0,3}" }+ } }? + + code-point-attributes &= attribute kTGHZ2013 + { list { xsd:string { pattern="[0-9]{3}\.[0-9]{3}(,[0-9]{3}\.[0-9]{3})*:[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kTotalStrokes + { list { xsd:string { pattern="[1-9][0-9]{0,2}" }+ } }? + + code-point-attributes &= attribute kTraditionalVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}" }+ } }? + + code-point-attributes &= attribute kUnihanCore2020 + { xsd:string { pattern="[GHJKMPT]{1,7}" } }? + + code-point-attributes &= attribute kVietnamese + { list { xsd:string { pattern="[A-Za-z\x{110}\x{111}\x{300}-\x{303}\x{306}\x{309}\x{31B}\x{323}]+" }+ } }? + + code-point-attributes &= attribute kVietnameseNumeric + { list { xsd:string { pattern="\d+" }+ } }? + + code-point-attributes &= attribute kXerox + { list { xsd:string { pattern="[0-9]{3}:[0-9]{3}" }+ } }? + + code-point-attributes &= attribute kXHC1983 + { list { xsd:string { pattern="[0-9]{4}\.[0-9]{3}\*?(,[0-9]{4}\.[0-9]{3}\*?)*:[a-z\x{300}\x{301}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kZhuang + { list { xsd:string { pattern="[a-z]+\*?" }+ } }? + + code-point-attributes &= attribute kZhuangNumeric + { list { xsd:string { pattern="\d+" }+ } }? + + code-point-attributes &= attribute kZVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}(<[ks][A-Za-z0-9_]+(:[TBZ]+)?(,[ks][A-Za-z0-9_]+(:[TBZ]+)?)*)?" 
}+ } }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/age.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/age.xml new file mode 100644 index 0000000000..8a1722f229 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/age.xml @@ -0,0 +1,23 @@ + + + code-point-attributes &= + attribute age { "1.1" + | "2.0" | "2.1" + | "3.0" | "3.1" | "3.2" + | "4.0" | "4.1" + | "5.0" | "5.1" | "5.2" + | "6.0" | "6.1" | "6.2" | "6.3" + | "7.0" + | "8.0" + | "9.0" + | "10.0" + | "11.0" + | "12.0" | "12.1" + | "13.0" + | "14.0" + | "15.0" | "15.1" + | "16.0" + | "17.0" + | "unassigned" + }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/bc.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/bc.xml new file mode 100644 index 0000000000..d3e70a6abe --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/bc.xml @@ -0,0 +1,17 @@ + + + code-point-attributes &= + attribute bc { "AL" | "AN" + | "B" | "BN" + | "CS" + | "EN" | "ES" | "ET" + | "FSI" + | "L" | "LRE" | "LRI" | "LRO" + | "NSM" + | "ON" + | "PDF" | "PDI" + | "R" | "RLE" | "RLI" | "RLO" + | "S" + | "WS" + }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/blk.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/blk.xml new file mode 100644 index 0000000000..ecd721a634 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/blk.xml @@ -0,0 +1,344 @@ + + + code-point-attributes &= + attribute blk { "Adlam" + | "Aegean_Numbers" + | "Ahom" + | "Alchemical" + | "Alphabetic_PF" + | "Anatolian_Hieroglyphs" + | "Ancient_Greek_Music" + | "Ancient_Greek_Numbers" + | "Ancient_Symbols" + | "Arabic" + | "Arabic_Ext_A" + | "Arabic_Ext_B" + | "Arabic_Ext_C" + | "Arabic_Math" + | "Arabic_PF_A" + | "Arabic_PF_B" + | "Arabic_Sup" + | "Armenian" + | "Arrows" + | "ASCII" + | "Avestan" + | "Balinese" + | "Bamum" + | "Bamum_Sup" + | "Bassa_Vah" + | "Batak" + | "Bengali" + | "Bhaiksuki" + | "Block_Elements" + | "Bopomofo" + | "Bopomofo_Ext" + | "Box_Drawing" + | "Brahmi" + | "Braille" + | "Buginese" + | "Buhid" + | "Byzantine_Music" + | "Carian" + | "Caucasian_Albanian" + | "Chakma" + | "Cham" + | "Cherokee" + | "Cherokee_Sup" + | "Chess_Symbols" + | "Chorasmian" + | "CJK" + | "CJK_Compat" + | "CJK_Compat_Forms" + | "CJK_Compat_Ideographs" + | "CJK_Compat_Ideographs_Sup" + | "CJK_Ext_A" + | "CJK_Ext_B" + | "CJK_Ext_C" + | "CJK_Ext_D" + | "CJK_Ext_E" + | "CJK_Ext_F" + | "CJK_Ext_G" + | "CJK_Ext_H" + | "CJK_Ext_I" + | "CJK_Radicals_Sup" + | "CJK_Strokes" + | "CJK_Symbols" + | "Compat_Jamo" + | "Control_Pictures" + | "Coptic" + | "Coptic_Epact_Numbers" + | "Counting_Rod" + | "Cuneiform" + | "Cuneiform_Numbers" + | "Currency_Symbols" + | "Cypriot_Syllabary" + | "Cypro_Minoan" + | "Cyrillic" + | "Cyrillic_Ext_A" + | "Cyrillic_Ext_B" + | "Cyrillic_Ext_C" + | "Cyrillic_Ext_D" + | "Cyrillic_Sup" + | "Deseret" + | "Devanagari" + | "Devanagari_Ext" + | "Devanagari_Ext_A" + | "Diacriticals" + | "Diacriticals_Ext" + | "Diacriticals_For_Symbols" + | "Diacriticals_Sup" + | "Dingbats" + | "Dives_Akuru" + | "Dogra" + 
| "Domino" + | "Duployan" + | "Early_Dynastic_Cuneiform" + | "Egyptian_Hieroglyph_Format_Controls" + | "Egyptian_Hieroglyphs" + | "Egyptian_Hieroglyphs_Ext_A" + | "Elbasan" + | "Elymaic" + | "Emoticons" + | "Enclosed_Alphanum" + | "Enclosed_Alphanum_Sup" + | "Enclosed_CJK" + | "Enclosed_Ideographic_Sup" + | "Ethiopic" + | "Ethiopic_Ext" + | "Ethiopic_Ext_A" + | "Ethiopic_Ext_B" + | "Ethiopic_Sup" + | "Garay" + | "Geometric_Shapes" + | "Geometric_Shapes_Ext" + | "Georgian" + | "Georgian_Ext" + | "Georgian_Sup" + | "Glagolitic" + | "Glagolitic_Sup" + | "Gothic" + | "Grantha" + | "Greek" + | "Greek_Ext" + | "Gujarati" + | "Gunjala_Gondi" + | "Gurmukhi" + | "Gurung_Khema" + | "Half_And_Full_Forms" + | "Half_Marks" + | "Hangul" + | "Hanifi_Rohingya" + | "Hanunoo" + | "Hatran" + | "Hebrew" + | "High_PU_Surrogates" + | "High_Surrogates" + | "Hiragana" + | "IDC" + | "Ideographic_Symbols" + | "Imperial_Aramaic" + | "Indic_Number_Forms" + | "Indic_Siyaq_Numbers" + | "Inscriptional_Pahlavi" + | "Inscriptional_Parthian" + | "IPA_Ext" + | "Jamo" + | "Jamo_Ext_A" + | "Jamo_Ext_B" + | "Javanese" + | "Kaithi" + | "Kaktovik_Numerals" + | "Kana_Ext_A" + | "Kana_Ext_B" + | "Kana_Sup" + | "Kanbun" + | "Kangxi" + | "Kannada" + | "Katakana" + | "Katakana_Ext" + | "Kawi" + | "Kayah_Li" + | "Kharoshthi" + | "Khitan_Small_Script" + | "Khmer" + | "Khmer_Symbols" + | "Khojki" + | "Khudawadi" + | "Kirat_Rai" + | "Lao" + | "Latin_1_Sup" + | "Latin_Ext_A" + | "Latin_Ext_Additional" + | "Latin_Ext_B" + | "Latin_Ext_C" + | "Latin_Ext_D" + | "Latin_Ext_E" + | "Latin_Ext_F" + | "Latin_Ext_G" + | "Lepcha" + | "Letterlike_Symbols" + | "Limbu" + | "Linear_A" + | "Linear_B_Ideograms" + | "Linear_B_Syllabary" + | "Lisu" + | "Lisu_Sup" + | "Low_Surrogates" + | "Lycian" + | "Lydian" + | "Mahajani" + | "Mahjong" + | "Makasar" + | "Malayalam" + | "Mandaic" + | "Manichaean" + | "Marchen" + | "Masaram_Gondi" + | "Math_Alphanum" + | "Math_Operators" + | "Mayan_Numerals" + | "Medefaidrin" + | "Meetei_Mayek" + | 
"Meetei_Mayek_Ext" + | "Mende_Kikakui" + | "Meroitic_Cursive" + | "Meroitic_Hieroglyphs" + | "Miao" + | "Misc_Arrows" + | "Misc_Math_Symbols_A" + | "Misc_Math_Symbols_B" + | "Misc_Pictographs" + | "Misc_Symbols" + | "Misc_Technical" + | "Modi" + | "Modifier_Letters" + | "Modifier_Tone_Letters" + | "Mongolian" + | "Mongolian_Sup" + | "Mro" + | "Multani" + | "Music" + | "Myanmar" + | "Myanmar_Ext_A" + | "Myanmar_Ext_B" + | "Myanmar_Ext_C" + | "Nabataean" + | "Nag_Mundari" + | "Nandinagari" + | "NB" + | "New_Tai_Lue" + | "Newa" + | "NKo" + | "Number_Forms" + | "Nushu" + | "Nyiakeng_Puachue_Hmong" + | "OCR" + | "Ogham" + | "Ol_Chiki" + | "Ol_Onal" + | "Old_Hungarian" + | "Old_Italic" + | "Old_North_Arabian" + | "Old_Permic" + | "Old_Persian" + | "Old_Sogdian" + | "Old_South_Arabian" + | "Old_Turkic" + | "Old_Uyghur" + | "Oriya" + | "Ornamental_Dingbats" + | "Osage" + | "Osmanya" + | "Ottoman_Siyaq_Numbers" + | "Pahawh_Hmong" + | "Palmyrene" + | "Pau_Cin_Hau" + | "Phags_Pa" + | "Phaistos" + | "Phoenician" + | "Phonetic_Ext" + | "Phonetic_Ext_Sup" + | "Playing_Cards" + | "Psalter_Pahlavi" + | "PUA" + | "Punctuation" + | "Rejang" + | "Rumi" + | "Runic" + | "Samaritan" + | "Saurashtra" + | "Sharada" + | "Shavian" + | "Shorthand_Format_Controls" + | "Siddham" + | "Sinhala" + | "Sinhala_Archaic_Numbers" + | "Small_Forms" + | "Small_Kana_Ext" + | "Sogdian" + | "Sora_Sompeng" + | "Soyombo" + | "Specials" + | "Sundanese" + | "Sundanese_Sup" + | "Sunuwar" + | "Sup_Arrows_A" + | "Sup_Arrows_B" + | "Sup_Arrows_C" + | "Sup_Math_Operators" + | "Sup_PUA_A" + | "Sup_PUA_B" + | "Sup_Punctuation" + | "Sup_Symbols_And_Pictographs" + | "Super_And_Sub" + | "Sutton_SignWriting" + | "Syloti_Nagri" + | "Symbols_And_Pictographs_Ext_A" + | "Symbols_For_Legacy_Computing" + | "Symbols_For_Legacy_Computing_Sup" + | "Syriac" + | "Syriac_Sup" + | "Tagalog" + | "Tagbanwa" + | "Tags" + | "Tai_Le" + | "Tai_Tham" + | "Tai_Viet" + | "Tai_Xuan_Jing" + | "Takri" + | "Tamil" + | "Tamil_Sup" + | "Tangsa" + | 
"Tangut" + | "Tangut_Components" + | "Tangut_Sup" + | "Telugu" + | "Thaana" + | "Thai" + | "Tibetan" + | "Tifinagh" + | "Tirhuta" + | "Todhri" + | "Toto" + | "Transport_And_Map" + | "Tulu_Tigalari" + | "UCAS" + | "UCAS_Ext" + | "UCAS_Ext_A" + | "Ugaritic" + | "Vai" + | "Vedic_Ext" + | "Vertical_Forms" + | "Vithkuqi" + | "VS" + | "VS_Sup" + | "Wancho" + | "Warang_Citi" + | "Yezidi" + | "Yi_Radicals" + | "Yi_Syllables" + | "Yijing" + | "Zanabazar_Square" + | "Znamenny_Music" + }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/block.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/block.xml new file mode 100644 index 0000000000..1d9b2beb8b --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/block.xml @@ -0,0 +1,10 @@ + + + + ucd.content &= + element blocks { + element block { + attribute first-cp { single-code-point }, + attribute last-cp { single-code-point }, + attribute name { text } }+ }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/bmg.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/bmg.xml new file mode 100644 index 0000000000..d4431070d5 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/bmg.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute bmg { "" | single-code-point }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/boolean.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/boolean.xml new file mode 100644 index 0000000000..fae36d68db --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/boolean.xml @@ -0,0 +1,4 @@ + + + boolean = "Y" | "N" + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/boundaries.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/boundaries.xml new file mode 100644 index 0000000000..abe4ffe9a0 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/boundaries.xml @@ -0,0 +1,58 @@ + + + code-point-attributes &= + attribute Gr_Base { boolean }? + + code-point-attributes &= + attribute Gr_Ext { boolean }? + + code-point-attributes &= + attribute OGr_Ext { boolean }? + + code-point-attributes &= + attribute Gr_Link { boolean }? + + code-point-attributes &= + attribute GCB { "CN" | "CR" + | "EB" | "EBG" | "EM" | "EX" + | "GAZ" + | "L" | "LF" | "LV" | "LVT" + | "PP" + | "RI" + | "SM" + | "T" + | "V" + | "XX" + | "ZWJ" + }? + + code-point-attributes &= + attribute WB { "CR" + | "DQ" + | "EB" | "EBG" | "EM" | "EX" | "Extend" + | "FO" + | "GAZ" + | "HL" + | "KA" + | "LE" | "LF" + | "MB" | "ML" | "MN" + | "NL" | "NU" + | "RI" + | "SQ" + | "WSegSpace" + | "XX" + | "ZWJ" + }? + + code-point-attributes &= + attribute SB { "AT" + | "CL" | "CR" + | "EX" + | "FO" + | "LE" | "LF" | "LO" + | "NU" + | "SC" | "SE" | "SP" | "ST" + | "UP" + | "XX" + }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/bpb.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/bpb.xml new file mode 100644 index 0000000000..3924ed3e9d --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/bpb.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute bpb { "#" | single-code-point }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/bpt.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/bpt.xml new file mode 100644 index 0000000000..183c9bf3f1 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/bpt.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute bpt { "o" | "c" | "n" }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/case_folding.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/case_folding.xml new file mode 100644 index 0000000000..8708699bee --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/case_folding.xml @@ -0,0 +1,8 @@ + + + code-point-attributes &= + attribute scf { "#" | single-code-point }? + + code-point-attributes &= + attribute cf { "#" | one-or-more-code-points }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/case_mapping.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/case_mapping.xml new file mode 100644 index 0000000000..c1296b7b94 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/case_mapping.xml @@ -0,0 +1,11 @@ + + + code-point-attributes &= + attribute uc { "#" | one-or-more-code-points }? + + code-point-attributes &= + attribute lc { "#" | one-or-more-code-points }? + + code-point-attributes &= + attribute tc { "#" | one-or-more-code-points }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/case_other.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/case_other.xml new file mode 100644 index 0000000000..df4b97e640 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/case_other.xml @@ -0,0 +1,32 @@ + + + code-point-attributes &= + attribute CI { boolean }? + + code-point-attributes &= + attribute Cased { boolean }? + + code-point-attributes &= + attribute CWCF { boolean }? + + code-point-attributes &= + attribute CWCM { boolean }? + + code-point-attributes &= + attribute CWL { boolean }? + + code-point-attributes &= + attribute CWKCF { boolean }? + + code-point-attributes &= + attribute CWT { boolean }? + + code-point-attributes &= + attribute CWU { boolean }? + + code-point-attributes &= + attribute NFKC_CF { "#" | zero-or-more-code-points }? + + code-point-attributes &= + attribute NFKC_SCF { "#" | zero-or-more-code-points }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/casing.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/casing.xml new file mode 100644 index 0000000000..503f059999 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/casing.xml @@ -0,0 +1,14 @@ + + + code-point-attributes &= + attribute Upper { boolean }? + + code-point-attributes &= + attribute Lower { boolean }? + + code-point-attributes &= + attribute OUpper { boolean }? + + code-point-attributes &= + attribute OLower { boolean }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/ccc.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/ccc.xml new file mode 100644 index 0000000000..8226509d71 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/ccc.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute ccc { xsd:integer { minInclusive="0" maxInclusive="254" } }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/cjk-radicals.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/cjk-radicals.xml new file mode 100644 index 0000000000..45c49ed2c1 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/cjk-radicals.xml @@ -0,0 +1,10 @@ + + + + ucd.content &= + element cjk-radicals { + element cjk-radical { + attribute number { xsd:string {pattern="[0-9]{1,3}'{0,3}"}}, + attribute radical { single-code-point? }, + attribute ideograph { single-code-point } }+ }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/cjkEACC.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/cjkEACC.xml new file mode 100644 index 0000000000..08222c4f01 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/cjkEACC.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= attribute cjkEACC + { xsd:string { pattern="[0-9A-F]{6}" } }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/cjkIRG_TSource.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/cjkIRG_TSource.xml new file mode 100644 index 0000000000..49f9c3917d --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/cjkIRG_TSource.xml @@ -0,0 +1,6 @@ + + + code-point-attributes &= attribute cjkIRG_TSource + { xsd:string { pattern="T([1-7A-F]|1[1-3])-[0-9A-F]{4} +| TU-[023][0-9A-F]{4}" } }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/composition.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/composition.xml new file mode 100644 index 0000000000..96ce4abcf6 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/composition.xml @@ -0,0 +1,8 @@ + + + code-point-attributes &= + attribute CE { boolean }? + + code-point-attributes &= + attribute Comp_Ex { boolean }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/datatypes.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/datatypes.xml new file mode 100644 index 0000000000..c26367d970 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/datatypes.xml @@ -0,0 +1,5 @@ + + + + # default; datatypes xsd = "http://www.w3.org/2001/XMLSchema-datatypes" + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/datatypes_code_points.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/datatypes_code_points.xml new file mode 100644 index 0000000000..c3cda88df1 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/datatypes_code_points.xml @@ -0,0 +1,9 @@ + + + + single-code-point = xsd:string { pattern = "(|[1-9A-F]|(10))[0-9A-F]{4}" } + + one-or-more-code-points = list { single-code-point + } + zero-or-more-code-points = list { single-code-point * } + two-code-points = list { single-code-point, single-code-point } + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/decomposition.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/decomposition.xml new file mode 100644 index 0000000000..833a7d1e06 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/decomposition.xml @@ -0,0 +1,11 @@ + + + code-point-attributes &= + attribute dt { "can" | "com" | 
"enc" | "fin" | "font" | "fra" + | "init" | "iso" | "med" | "nar" | "nb" | "sml" + | "sqr" | "sub" | "sup" | "vert" | "wide" | "none" + }? + + code-point-attributes &= + attribute dm { "#" | zero-or-more-code-points }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/description.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/description.xml new file mode 100644 index 0000000000..97bb063e7d --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/description.xml @@ -0,0 +1,6 @@ + + + + ucd.content &= + element description { text }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/do-not-emit.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/do-not-emit.xml new file mode 100644 index 0000000000..5381491e7f --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/do-not-emit.xml @@ -0,0 +1,22 @@ + + + ucd.content &= + element do-not-emit { + element instead { + attribute of { one-or-more-code-points }, + attribute use { one-or-more-code-points }, + attribute because { "Bengali_Khanda_Ta" + | "Deprecated" + | "Discouraged" + | "Dotless_Form" + | "Hamza_Form" + | "Indic_Atomic_Consonant" + | "Indic_Consonant_Conjunct" + | "Indic_Vowel_Letter" + | "Malayalam_Chillu" + | "Precomposed_Form" + | "Precomposed_Hieroglyph" + | "Preferred_Spelling" + | "Tamil_Shrii" + } }+ }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/ea.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/ea.xml new file mode 100644 index 0000000000..d51bf24414 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/ea.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute ea { "A" | "F" | "H" | "N" | "Na" | "W" }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/emoji-sources.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/emoji-sources.xml new file mode 100644 index 0000000000..96d122953e --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/emoji-sources.xml @@ -0,0 +1,11 @@ + + + + ucd.content &= + element emoji-sources { + element emoji-source { + attribute unicode { one-or-more-code-points }, + attribute docomo { jis-code-point? }, + attribute kddi { jis-code-point? }, + attribute softbank { jis-code-point? } }+ }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/function_graphic.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/function_graphic.xml new file mode 100644 index 0000000000..7ce510adc0 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/function_graphic.xml @@ -0,0 +1,68 @@ + + + code-point-attributes &= + attribute Dash { boolean }? + + code-point-attributes &= + attribute Hyphen { boolean }? + + code-point-attributes &= + attribute QMark { boolean }? + + code-point-attributes &= + attribute Term { boolean }? + + code-point-attributes &= + attribute STerm { boolean }? + + code-point-attributes &= + attribute Dia { boolean }? + + code-point-attributes &= + attribute Ext { boolean }? + + code-point-attributes &= + attribute SD { boolean }? + + code-point-attributes &= + attribute Alpha { boolean }? + + code-point-attributes &= + attribute OAlpha { boolean }? + + code-point-attributes &= + attribute Math { boolean }? + + code-point-attributes &= + attribute OMath { boolean }? + + code-point-attributes &= + attribute Hex { boolean }? + + code-point-attributes &= + attribute AHex { boolean }? + + code-point-attributes &= + attribute DI { boolean }? + + code-point-attributes &= + attribute ODI { boolean }? + + code-point-attributes &= + attribute LOE { boolean }? 
+ + code-point-attributes &= + attribute PCM { boolean }? + + code-point-attributes &= + attribute MCM { boolean }? + + code-point-attributes &= + attribute WSpace { boolean }? + + code-point-attributes &= + attribute vo { "R" | "Tr" | "Tu" | "U" }? + + code-point-attributes &= + attribute RI { boolean }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/gc.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/gc.xml new file mode 100644 index 0000000000..36cd1f7749 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/gc.xml @@ -0,0 +1,12 @@ + + + code-point-attributes &= + attribute gc { "Cc" | "Cf" | "Cn" | "Co" | "Cs" + | "Ll" | "Lm" | "Lo" | "Lt" | "Lu" + | "Mc" | "Me" | "Mn" + | "Nd" | "Nl" | "No" + | "Pc" | "Pd" | "Pe" | "Pf" | "Pi" | "Po" | "Ps" + | "Sc" | "Sk" | "Sm" | "So" + | "Zl" | "Zp" | "Zs" + }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/groups.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/groups.xml new file mode 100644 index 0000000000..11f3b0dd97 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/groups.xml @@ -0,0 +1,8 @@ + + + + group = + element group { + code-point-attributes, + code-point* } + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/hst.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/hst.xml new file mode 100644 index 0000000000..385cd466ab --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/hst.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute hst { "L" | "LV" | "LVT" | "NA" | "T" | "V" }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/identifier.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/identifier.xml new file mode 100644 index 0000000000..0ab95a27f0 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/identifier.xml @@ -0,0 +1,26 @@ + + + code-point-attributes &= + attribute IDS { boolean }? + + code-point-attributes &= + attribute OIDS { boolean }? + + code-point-attributes &= + attribute XIDS { boolean }? + + code-point-attributes &= + attribute IDC { boolean }? + + code-point-attributes &= + attribute OIDC { boolean }? + + code-point-attributes &= + attribute XIDC { boolean }? + + code-point-attributes &= + attribute ID_Compat_Math_Start { boolean }? + + code-point-attributes &= + attribute ID_Compat_Math_Continue { boolean }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/ideographs.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/ideographs.xml new file mode 100644 index 0000000000..0c758e3425 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/ideographs.xml @@ -0,0 +1,23 @@ + + + code-point-attributes &= + attribute Ideo { boolean }? + + code-point-attributes &= + attribute UIdeo { boolean }? + + code-point-attributes &= + attribute EqUIdeo { single-code-point }? + + code-point-attributes &= + attribute IDSB { boolean }? + + code-point-attributes &= + attribute IDST { boolean }? + + code-point-attributes &= + attribute IDSU { boolean }? + + code-point-attributes &= + attribute Radical { boolean }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/isc.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/isc.xml new file mode 100644 index 0000000000..f19b593171 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/isc.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute isc { text }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/jis-code-point.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/jis-code-point.xml new file mode 100644 index 0000000000..9a6820c7b4 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/jis-code-point.xml @@ -0,0 +1,5 @@ + + + + jis-code-point = xsd:string { pattern = "[0-9A-F]{4}" } + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/joining.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/joining.xml new file mode 100644 index 0000000000..184fcca14d --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/joining.xml @@ -0,0 +1,53 @@ + + + code-point-attributes &= + attribute jt { "C" | "D" | "L" | "R" | "T" | "U" }? 
+ + code-point-attributes &= + attribute jg { "African_Feh" | "African_Noon" | "African_Qaf" + | "Ain" | "Alaph" | "Alef" + | "Beh" | "Beth" | "Burushaski_Yeh_Barree" + | "Dal" | "Dalath_Rish" + | "E" + | "Farsi_Yeh" | "Fe" | "Feh" | "Final_Semkath" + | "Gaf" | "Gamal" + | "Hah" | "Hanifi_Rohingya_Kinna_Ya" + | "Hanifi_Rohingya_Pa" | "He" | "Heh" | "Heh_Goal" + | "Heth" + | "Kaf" | "Kaph" | "Kashmiri_Yeh" | "Khaph" + | "Knotted_Heh" + | "Lam" | "Lamadh" + | "Malayalam_Bha" | "Malayalam_Ja" | "Malayalam_Lla" + | "Malayalam_Llla" | "Malayalam_Nga" + | "Malayalam_Nna" | "Malayalam_Nnna" + | "Malayalam_Nya" | "Malayalam_Ra" | "Malayalam_Ssa" + | "Malayalam_Tta" | "Manichaean_Aleph" + | "Manichaean_Ayin" | "Manichaean_Beth" + | "Manichaean_Daleth" | "Manichaean_Dhamedh" + | "Manichaean_Five" | "Manichaean_Gimel" + | "Manichaean_Heth" | "Manichaean_Hundred" + | "Manichaean_Kaph" | "Manichaean_Lamedh" + | "Manichaean_Mem" | "Manichaean_Nun" + | "Manichaean_One" | "Manichaean_Pe" + | "Manichaean_Qoph" | "Manichaean_Resh" + | "Manichaean_Sadhe" | "Manichaean_Samekh" + | "Manichaean_Taw" | "Manichaean_Ten" + | "Manichaean_Teth" | "Manichaean_Thamedh" + | "Manichaean_Twenty" | "Manichaean_Waw" + | "Manichaean_Yodh" | "Manichaean_Zayin" | "Meem" + | "Mim" + | "No_Joining_Group" | "Noon" | "Nun" | "Nya" + | "Pe" + | "Qaf" | "Qaph" + | "Reh" | "Reversed_Pe" | "Rohingya_Yeh" + | "Sad" | "Sadhe" | "Seen" | "Semkath" | "Shin" + | "Straight_Waw" | "Swash_Kaf" | "Syriac_Waw" + | "Tah" | "Taw" | "Teh_Marbuta" | "Teh_Marbuta_Goal" + | "Teth" | "Thin_Yeh" + | "Vertical_Tail" + | "Waw" + | "Yeh" | "Yeh_Barree" | "Yeh_With_Tail" | "Yudh" + | "Yudh_He" + | "Zain" | "Zhain" + }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/lb.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/lb.xml new file mode 100644 index 0000000000..ee1f36cac0 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/lb.xml @@ -0,0 +1,24 @@ + + + code-point-attributes &= + attribute lb { "AI" | "AK" | "AL" | "AP" | "AS" + | "B2" | "BA" | "BB" | "BK" + | "CB" | "CJ" | "CL" | "CM" | "CP" | "CR" + | "EB" | "EM" | "EX" + | "GL" + | "H2" | "H3" | "HL" | "HY" + | "ID" | "IN" | "IS" + | "JL" | "JT" | "JV" + | "LF" + | "NL" | "NS" | "NU" + | "OP" + | "PO" | "PR" + | "QU" + | "RI" + | "SA" | "SG" | "SP" | "SY" + | "VF" | "VI" + | "WJ" + | "XX" + | "ZW" | "ZWJ" + }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/miscellaneous.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/miscellaneous.xml new file mode 100644 index 0000000000..5dafe8c223 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/miscellaneous.xml @@ -0,0 +1,11 @@ + + + code-point-attributes &= + attribute Dep { boolean }? + + code-point-attributes &= + attribute VS { boolean }? + + code-point-attributes &= + attribute NChar { boolean }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/na.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/na.xml new file mode 100644 index 0000000000..4c4644c311 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/na.xml @@ -0,0 +1,13 @@ + + + code-point-attributes &= + attribute na { "" | + "CJK UNIFIED IDEOGRAPH-#" | + "CJK COMPATIBILITY IDEOGRAPH-#" | + "EGYPTIAN HIEROGLYPH-#" | + "TANGUT IDEOGRAPH-#" | + "KHITAN SMALL SCRIPT CHARACTER-#" | + "NUSHU CHARACTER-#" | + xsd:string { pattern="[a-zA-Z0-9]+(( -|- |[\-_ ])[a-zA-Z0-9]+)*" } + }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/na1.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/na1.xml new file mode 100644 index 0000000000..592de98c37 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/na1.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute na1 { "" | xsd:string { pattern="[a-zA-Z0-9]+([\-_ ][a-zA-Z0-9]+)*( \(.*\))?" } }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/named-sequences.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/named-sequences.xml new file mode 100644 index 0000000000..2859ea29d9 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/named-sequences.xml @@ -0,0 +1,15 @@ + + + + ucd.content &= + element named-sequences { + element named-sequence { + attribute cps { one-or-more-code-points }, + attribute name { text } }+ }? + + ucd.content &= + element provisional-named-sequences { + element named-sequence { + attribute cps { one-or-more-code-points }, + attribute name { text } }+ }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/namespace.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/namespace.xml new file mode 100644 index 0000000000..e75306a26f --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/namespace.xml @@ -0,0 +1,5 @@ + + + + default namespace ucd = "http://www.unicode.org/ns/2003/ucd/1.0" + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/normalization-corrections.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/normalization-corrections.xml new file mode 100644 index 0000000000..7231a8c261 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/normalization-corrections.xml @@ -0,0 +1,11 @@ + + + + ucd.content &= + element normalization-corrections { + element normalization-correction { + attribute cp { single-code-point }, + attribute old { one-or-more-code-points }, + attribute new { one-or-more-code-points }, + attribute version { text } }+ }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/numeric.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/numeric.xml new file mode 100644 index 0000000000..24230aee1a --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/numeric.xml @@ -0,0 +1,8 @@ + + + code-point-attributes &= + attribute nt { "De" | "Di" | "Nu" | "None" }? + + code-point-attributes &= + attribute nv { "NaN" | xsd:string { pattern="-?[0-9]+(/[0-9]+)?" } }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/pattern.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/pattern.xml new file mode 100644 index 0000000000..baa00a73c7 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/pattern.xml @@ -0,0 +1,8 @@ + + + code-point-attributes &= + attribute Pat_Syn { boolean }? + + code-point-attributes &= + attribute Pat_WS { boolean }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/quickcheck.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/quickcheck.xml new file mode 100644 index 0000000000..224c2287ea --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/quickcheck.xml @@ -0,0 +1,31 @@ + + + code-point-attributes &= + attribute NFC_QC { "Y" | "N" | "M" }? + + code-point-attributes &= + attribute NFD_QC { "Y" | "N" }? + + code-point-attributes &= + attribute NFKC_QC { "Y" | "N" | "M" }? + + code-point-attributes &= + attribute NFKD_QC { "Y" | "N" }? + + + code-point-attributes &= + attribute XO_NFC { boolean }? + + code-point-attributes &= + attribute XO_NFD { boolean }? + + code-point-attributes &= + attribute XO_NFKC { boolean }? + + code-point-attributes &= + attribute XO_NFKD { boolean }? + + + code-point-attributes &= + attribute FC_NFKC { "#" | one-or-more-code-points }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/repertoire.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/repertoire.xml new file mode 100644 index 0000000000..0cfc86e40a --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/repertoire.xml @@ -0,0 +1,6 @@ + + + + ucd.content &= + element repertoire { (code-point | group) + }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/repertoire_Code_points.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/repertoire_Code_points.xml new file mode 100644 index 0000000000..cdfd1ad884 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/repertoire_Code_points.xml @@ -0,0 +1,23 @@ + + + + code-point |= + element reserved { + set-of-code-points, + code-point-attributes } + + code-point |= + element noncharacter { + set-of-code-points, + code-point-attributes } + + code-point |= + element surrogate { + set-of-code-points, + code-point-attributes } + + code-point |= + element char { + set-of-code-points, + code-point-attributes } + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/script.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/script.xml new file mode 100644 index 0000000000..b22243aaf8 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/script.xml @@ -0,0 +1,49 @@ + + + script = "Adlm" | "Aghb" | "Ahom" | "Arab" | "Armi" | "Armn" + | "Avst" + | "Bali" | "Bamu" | "Bass" | "Batk" | "Beng" | "Bhks" + | "Bopo" | "Brah" | "Brai" | "Bugi" | "Buhd" + | "Cakm" | "Cans" | "Cari" | "Cham" | "Cher" | "Chrs" + | "Copt" | "Cpmn" | "Cprt" | "Cyrl" + | "Deva" | "Diak" | "Dogr" | "Dsrt" | "Dupl" + | "Egyp" | "Elba" | "Elym" | "Ethi" + | "Gara" | "Geor" | "Glag" | "Gong" | "Gonm" | "Goth" + | "Gran" | "Grek" | "Gujr" | "Gukh" | "Guru" + | "Hang" | "Hani" | "Hano" | "Hatr" | "Hebr" | "Hira" + | "Hluw" | "Hmng" | "Hmnp" | "Hrkt" | "Hung" + | "Ital" + | "Java" + | "Kali" | "Kana" | "Kawi" | "Khar" | "Khmr" | "Khoj" + | "Kits" | "Knda" | "Krai" | "Kthi" + | "Lana" | "Laoo" | "Latn" | "Lepc" | "Limb" | "Lina" + | "Linb" | "Lisu" | "Lyci" | "Lydi" + | "Mahj" | "Maka" | "Mand" | "Mani" | "Marc" | "Medf" + | "Mend" | "Merc" | "Mero" | "Mlym" | "Modi" | "Mong" + | "Mroo" | "Mtei" 
| "Mult" | "Mymr" + | "Nagm" | "Nand" | "Narb" | "Nbat" | "Newa" | "Nkoo" + | "Nshu" + | "Ogam" | "Olck" | "Onao" | "Orkh" | "Orya" | "Osge" + | "Osma" | "Ougr" + | "Palm" | "Pauc" | "Perm" | "Phag" | "Phli" | "Phlp" + | "Phnx" | "Plrd" | "Prti" + | "Rjng" | "Rohg" | "Runr" + | "Samr" | "Sarb" | "Saur" | "Sgnw" | "Shaw" | "Shrd" + | "Sidd" | "Sind" | "Sinh" | "Sogd" | "Sogo" | "Sora" + | "Soyo" | "Sund" | "Sunu" | "Sylo" | "Syrc" + | "Tagb" | "Takr" | "Tale" | "Talu" | "Taml" | "Tang" + | "Tavt" | "Telu" | "Tfng" | "Tglg" | "Thaa" | "Thai" + | "Tibt" | "Tirh" | "Tnsa" | "Todr" | "Toto" | "Tutg" + | "Ugar" + | "Vaii" | "Vith" + | "Wara" | "Wcho" + | "Xpeo" | "Xsux" + | "Yezi" | "Yiii" + | "Zanb" | "Zinh" | "Zyyy" | "Zzzz" + + code-point-attributes &= + attribute sc { script }? + + code-point-attributes &= + attribute scx { list { script + } }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/simple_case_mapping.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/simple_case_mapping.xml new file mode 100644 index 0000000000..e2acb669c2 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/simple_case_mapping.xml @@ -0,0 +1,11 @@ + + + code-point-attributes &= + attribute suc { "#" | single-code-point }? + + code-point-attributes &= + attribute slc { "#" | single-code-point }? + + code-point-attributes &= + attribute stc { "#" | single-code-point }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/standardized-variants.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/standardized-variants.xml new file mode 100644 index 0000000000..a415a1152a --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/standardized-variants.xml @@ -0,0 +1,10 @@ + + + + ucd.content &= + element standardized-variants { + element standardized-variant { + attribute cps { two-code-points }, + attribute desc { text }, + attribute when { text } }+ }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/start.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/start.xml new file mode 100644 index 0000000000..ba0e2262fb --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/start.xml @@ -0,0 +1,6 @@ + + + + start = + element ucd { ucd.content } + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/index.xml b/unicodetools/src/main/resources/org/unicode/uax42/index.xml new file mode 100644 index 0000000000..6b4733a2b0 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/index.xml @@ -0,0 +1,1353 @@ + + +
+ + Unicode Character Database in XML + + + + + 2024 + + + + Wilcock + John + + + + + + New value for the age attribute: 16.0. + + New values for the blk attribute: Egyptian_Hieroglyphs_Ext_A, + Garay, Gurung_Khema, Kirat_Rai, Myanmar_Ext_C, + Ol_Onal, Sunuwar, Symbols_for_Legacy_Computing_Sup, + Todhri, Tulu_Tigalari. + + New values for the script attribute: Gara, Gukh, + Krai, Onao, Sunu, Todr, Tutg. + + New value for the jg attribute: Kashmiri_Yeh. + New value for the InSC attribute: Reordering_Killer. + + New attributes: MCM, kFanqie, kZhuang. + + Modified patterns for the cjk-radical/@number, kRSUnicode and + kIRG_GSource + attributes. + + Added the do-not-emit element. + + + + Revision 35 being a proposed update, only changes between revisions 34 and 36 are + noted here. + + + + New value for the age attribute: 15.1. + + New value for the blk attribute: CJK_Ext_I. + + New values for the lb attribute: AK, AP, + AS, VF, VI. + + Modified values for the number, radical attributes of the + cjk-radical + element. + + Changed single value into list for the nv code point attribute. + + New code point attributes: ID_Compat_Math_Continue, + ID_Compat_Math_Start, IDSU, NFKC_SCF, InCB. + + Modified patterns for the kBigFive, kIRG_GSource, + kMorohashi, kRSUnicode attributes. + + Changed single values into lists for the kMorohashi, kPrimaryNumeric + Unihan attributes. + + New Unihan attributes: kJapanese, kMojiJoho, + kSMSZD2003Index, kSMSZD2003Readings, kVietnameseNumeric, + kZhuangNumeric. + + + + Revision 33 being a proposed update, only changes between revisions 32 and 34 are + noted here. + + + + New value for the age attribute: 15.0. + + New values for the blk attribute: Arabic_Ext_C, CJK_Ext_H, + Cyrillic_Ext_D, Devanagari_Ext_A, Kaktovik_Numerals, Kawi, + Nag_Mundari. + + New values for the script attribute: Kawi, Nagm. + + New Unihan attribute: kAlternateTotalStrokes. 
+ + Modified patterns for the kIRG_GSource, kIRG_HSource, + kIRG_TSource, kSemanticVariant, kSpecializedSemanticVariant, + kZVariant + attributes. + + + + Revision 31 being a proposed update, only changes between revisions 30 and 32 are + noted here. + + + + New value for the age attribute: 14.0. + + New values for the blk attribute: Arabic_Ext_B, + Cypro_Minoan, Ethiopic_Ext_B, Kana_Ext_B, + Latin_Ext_F, Latin_Ext_G, Old_Uyghur, Tangsa, + Toto, UCAS_Ext_A, Vithkuqi, Znamenny_Music. + + New values for the script attribute: Cpmn, Ougr, + Tnsa, Toto, Vith. + + New values for the jg attribute: Thin_Yeh, Vertical_Tail. + + New Unihan attribute: kStrange. + + Modified patterns for the kIRG_GSource, kIRG_MSource, + kIRG_VSource, kPhonetic, kSpoofingVariant attributes. + + Removal of the kWubi attribute, which has never been present in + released versions of the UCD. + + + + Revision 29 being a proposed update, only changes between revisions 28 and 30 are + noted here. + + + + New value for the age attribute: 13.0. + + New values for the blk attribute: Chorasmian, CJK_Ext_G, + Dives_Akuru, Khitan_Small_Script, Lisu_Sup, + Symbols_For_Legacy_Computing, Tangut_Sup, Yezidi. + + New values for the script attribute: Chrs, Diak, + Kits, Yezi. + + New value for the InPC attribute: Top_And_Bottom_And_Left. + + New Unihan attributes kSpoofingVariant, kUnihanCore2020, + kIRG_SSource, kIRG_UKSource, kTGHZ2013. + + New Emoji attributes Emoji, EPres, EMod, + EBase, EComp, ExtPict. + + Modified patterns for the kIRG_GSource, kIRG_HSource, + kIRG_KPSource, kIRG_KSource, kIRG_TSource, kKangXi, + kSemanticVariant, kSimplifiedVariant, + kSpecializedSemanticVariant, kTraditionalVariant attributes. + + + + Revision 27 being a proposed update, only changes between revisions 26 and 28 are + noted here. + + + + New value for the age attribute: 12.1. + + + + + + New value for the age attribute: 12.0. + + New values for the script attribute: Elym, Hmnp, + Nand, Wcho. 
+ + New values for the blk attribute: + Egyptian_Hieroglyph_Format_Controls, Elymaic, Nandinagari, + Nyiakeng_Puachue_Hmong, Ottoman_Siyaq_Numbers, Small_Kana_Ext, + Symbols_And_Pictographs_Ext_A, Tamil_Sup, Wancho. + + Modified patterns for the kIRG_GSource, kIRG_KSource, + kIRG_TSource, kTaiwanTelegraph attributes. + + + + Revision 24 being a proposed update, only changes between revisions 23 and 25 are + noted here. + + + + New value for the age attribute: 11.0. + + New values for the blk attribute: Chess_Symbols, + Dogra, Georgian_Ext, Gunjala_Gondi, + Hanifi_Rohingya, Indic_Siyaq_Numbers, Makasar, + Mayan_Numerals, Medefaidrin, Old_Sogdian, Sogdian. + + New values for the script attribute: Dogr, Gong, + Maka, Medf, Rohg, Sogd, Sogo. + + New values for the jg attribute: Hanifi_Rohingya_Kinna_Ya, + Hanifi_Rohingya_Pa. + + New value for the wb attribute: WSegSpace. + + New values for the InSC attribute: Consonant_Initial_Postfixed. + + New attributes: EqUIdeo, kJinmeiyoKanji, kJoyoKanji, + kKoreanEducationHanja, kKoreanName, kTGH. + + Modified patterns for the kTGT_MergedSrc attribute. + + Modified patterns for the kIRG_GSource, kIRG_HSource and + kIRG_VSource + attributes. + + + + Revision 22 being a proposed update, only changes between revisions 21 and 23 are + noted here. + + + + New value for the age attribute: 10.0. + + New values for the blk attribute: CJK_Ext_F, Kana_Ext_A, + Masaram_Gondi, Nushu, Soyombo, Syriac_Sup, + Zanabazar_Square. + + New values for the sc attribute: Gonm, Nshu, + Soyo, Zanb. + + New values for the jg attribute: Malayalam_Nga, + Malayalam_Ja, Malayalam_Nya, Malayalam_Tta, Malayalam_Nna, + Malayalam_Nnna, Malayalam_Bha, Malayalam_Ra, + Malayalam_Lla, Malayalam_Llla, Malayalam_Ssa. + + New value for the InPC attribute: Bottom_And_Left. + + Modified patterns for the kIRG_GSource, kIRG_JSource, + kIRG_KSource + attributes. 
+ + New code point attributes: vo, + RI + + New code point attributes for Nushu data: kSrc_NushuDuben and + kReading. + + + + Revision 20 being a proposed update, only changes between revisions 19 and 21 are + noted here. + + + + New value for the age attribute: 9.0. + + New values for the sc attribute: Adlm, Bhks, + Marc, Newa, Osge, Tang. + + New values for the blk attribute: Adlam, Bhaiksuki, + Cyrillic_Ext_C, Glagolitic_Sup, Ideographic_Symbols, + Marchen, Mongolian_Sup, Newa, Osage, + Tangut, Tangut_Components. + + New values for the gcb attribute: EB, EBG, EM, + GAZ, ZWJ. + + New values for the wb attribute: EB, EBG, EM, + GAZ, ZWJ. + + New values for the lb attribute: EB, EM, ZWJ. + + New values for the jg attribute: African_Feh, + African_Noon, African_Qaf. + + New code point attributes: PCM, kRSTUnicode and + kTGT_MergedSrc. + + Modified patterns for the kRSUnicode, kRSKangXi, + kMandarin, kIRG_JSource, kIRG_USource and kFennIndex + attributes. + + + + Revision 18 being a proposed update, only changes between revisions 17 and 19 are + noted here. + + + + New value for the age attribute: 8.0. + + New values for the sc attribute: Ahom, Hatr, + Hluw, Hung, Mult, Sgnw. + + New values for the blk attribute: Ahom, + Anatolian_Hieroglyphs, Cherokee_Sup, CJK_Ext_E, + Early_Dynastic_Cuneiform, Hatran, Multani, Old_Hungarian, + Sup_Symbols_And_Pictographs, Sutton_SignWriting. + + New values for the InSC attribute: Consonant_Killer, + Consonant_Prefixed, Consonant_With_Stacker, Syllable_Modifier. + + New code point attributes: InPC, kJa. + + New patterns for the kIRG_GSource attribute: GFC-, GGFZ-. + + Switched the reference to ISO 19757 from :2003 and :2003 Amd1 to :2008. + + + Revision 16 being a proposed update, only changes between revisions 15 and 17 are + noted here. + + + + New value for the age attribute: 7.0. + + New values for the jg attribute. + + New values for the sc attribute. + + New values for the blk attribute. + + New values for the InSC attribute. 
+ + New values for the kIICore attribute. + + New values for the kIRG_GSource attribute. + + + + Revision 14 being a proposed update, only changes between revisions 13 and 15 are + noted here. + + + + New value for the age attribute: 6.3. + + New values DQ, HL, SQ for the WB attribute (for Unicode 6.3). + + New code point attributes bpt and bpb (for Unicode 6.3). + + New values for the bc attribute: LRI, RLI, FSI, + PDI + (for Unicode 6.3). + + Updated the patterns for kHanyuPinlu and kTotalStrokes (for + Unicode 6.3). + + Updated the patterns for kIRG_HSource and kIRG_HSource (for + Unicode 6.2). + + Clarified that the child elements of list-like elements are in no particular order. + + + Revision 12 being a proposed update, only changes between revisions 11 and 13 are + noted here. + + + + New value for the age attribute: 6.2. + + New value for the gcb, wb and lb attributes: + RI + (for Unicode 6.2). + + Updated the patterns for kIRG_GSource and kIRG_HSource (for + Unicode 6.2). + + + + Revision 10 being a proposed update, only changes between revisions 9 and 11 are + noted here. + + + + Clarified the default values. + Indicate that property values may change from one release to the next. + Introduced the blk attribute, for the Block property. + + Introduced the scx attribute, for the ScriptExtensions property. + + Introduced the name-alias element, for the Name_Alias property. + + New value for the age attribute: 6.1. + + New values for the script attribute: Cakm, Merc, + Mero, Plrd, Shrd, Sora, Takr. + + New values for the lb attribute: HL and CJ. + + New value for the jg attribute: Rohingya_Yeh. + + The value of the fc_nfkc attribute must now be either # or + one-or-more-code-points. + + For the nv attribute, the absence of a numeric value is now represented by + NaN + rather than by the empty string. + + The values of the ccc are now restricted to 0..254, instead of 0..255.
+ + Updated the patterns for kSemanticVariant, + kSpecializedSemanticVariant, kIRG_USource, and kMandarin. + + + + Revision 8 being a proposed update, only changes between revisions 7 and 9 are noted + here. + + + + New value for the age attribute: 6.0. + + New value for the jg attribute: + Teh_Marbuta_Goal + + New values for the script attribute: Batk, Brah, + Mand. + + Updated the patterns for kIRG_GSource, kIRG_HSource, + kIRG_JSource, kIRG_KSource, kIRG_MSource, + kIRG_TSource, kIRG_VSource. + + Added the InSC and InMC elements. + + Added the emoji-sources element. + + + + Revision 6 being a proposed update, only changes between revisions 5 and 7 are noted + here. + + + + Changed the type of block/@first-cp, block/@last-cp and + normalization-corrections/@cp + from text to + single-code-point + + Changed the type of named-sequence/@cps, + provisional-named-sequences/@cps, normalization-correction/@old and + normalization-correction/@new + from text to one-or-more-code-points. + + Changed the type of standardized-variants/@cps from text to + two-code-points. + + New values for the jg attribute: Farsi_Yeh and Nya. + + New value for the age attribute: 5.2. + + New values for the sc attribute: Lana, Tavt, + Avst, Egyp, Samr, Lisu, Bamu, Java, + Mtei, Armi, Sarb, Prti, Phli, Orkh, + Kthi. + + New value for the lb attribute: CP. + + New value for the sc attribute: Zinh. + + New code point attributes CI, Cased, CWCF, + CWCM, CWL, CWKCF, CWT, CWU, + NFKC_CF. + + New attributes kHanyuPinyin and kIRG_MSource. + + New element + cjk-radicals + + Updated the patterns for kIRG_GSource, kIRG_JSource, + kIRG_KPSource, kIRG_KSource, kIRG_TSource, + kIRG_VSource, kHanyuPinlu, kMandarin, + kSemanticVariant, kSpecializedSemanticVariant, + kVietnamese, kZVariant. + + Point out that Relax NG schemas do not modify or augment the infoset, and that it is possible + to convert mechanically our schema to other schema languages.
+ + + + Revision 4 being a proposed update, only changes between revisions 3 and 5 are noted + here. + + + + First approved version, for Unicode 5.1.0. + For optional elements which act as collections, such as repertoire and + named-sequences, impose that there be at least one element in the collection. + + Remove the constraint that the value jg is limited when jt has + certain values; similarly for bmg / Bidi_M and for nv / + nt. + + Value NL added to the WB attribute (for Unicode 5.1). + + Value PP added to the GCB attribute (for Unicode 5.1). + + Corrected the Vai script value to Vaii. + + Removed the discussion of elements or attributes in different namespaces. + Removed the code-point element. + + + + + + Promoted to Draft UAX. + Changed the title from "An XML representation of the UCD" + Value 5.1 added to the age attribute (for Unicode 5.1). + + Value SM added to the gcb attribute (for Unicode 5.1). + + Values CR, Extend, LF, MB added to the + WB + attribute (for Unicode 5.1). + + Values CR, EX, LF, SC added to the SB + attribute (for Unicode 5.1). + + Value Burushaski_Yeh_Barree added to the jg attribute (for + Unicode 5.1). + + Value Alef_Maqsurah added to the jg attribute (for Unicode 2.x). + + Values Cari, Cham, Kali, Lepc, + Lyci, Lydi, Olck, Rjng, Saur, Sund and + Vai + added to the sc attribute (for Unicode 5.0). + + + jamo + attribute renamed to + JSN + + + sfc + attribute renamed to + scf + + Attribute kXHC1983 added (for Unicode 5.1.0). + + Pattern for attribute kIRG_USource extended (for Unicode 5.1.0). + + Element provisional-named-sequences added (for Unicode 5.0) + + + + + + First working draft. + + + + + + + This annex describes an XML representation of the Unicode Character Database. + + + + +
+ Introduction + In working on Unicode implementations, it is often useful to access the full content of the Unicode + Character Database (UCD). For example, in establishing mappings from characters to glyphs in fonts, it is + convenient to see the character scalar value, the character name, the character East Asian width, along with + the shape and metrics of the proposed glyph to map to; looking at all this data simultaneously helps in + evaluating the mapping. + + Directly accessing the data files that constitute the UCD is sometimes a daunting proposition. The data is + dispersed in a number of files of various formats, and there are just enough peculiarities (all justified by + the processing power available at the time the UCD representation was designed) to require a fairly intimate + knowledge of the data format itself, in addition to the meaning of the data. + + Many programming environments (for example, Java or ICU) do give access to the UCD. However, those + environments tend to lag behind releases of the standard, or support only some of the UCD content. + + Unibook is a wonderful tool to explore the UCD and in many cases is just the ticket; however, it is + difficult to use when the task at hand has not been built-in, or when non-UCD data is to be displayed as + well. + + This annex presents an alternative representation of the UCD, which is meant to overcome these + difficulties. We have chosen an XML representation, because parsing becomes a non-issue: there are a number + of XML parsers freely available, and using them is often fairly easy. In addition, there are freely + available tools that can perform powerful operations on XML data; for example, XPATH and XQUERY engines can + be thought of as a “grep” for XML data and XSLT engines can be thought of as + “awk” for XML data. + + It is important to note that we are interested in exploring the content of the UCD, rather than in using + the UCD data to process character streams. 
Thus, we are not concerned so much by the speed of processing or + the size of our representation. + + Our representation supports the creation of documents that represent only parts of the UCD, either by not + representing all the characters, or by not representing all the properties. This can be useful when only + some of the data is needed. + + This annex presents only the XML representation format of the UCD. The data itself is part of the Unicode + Character Database. + +
+ + + +
+ Overall schema + +
+ General principles + Our schema can be used to create and validate documents which are intended to represent properties of + Unicode code points, blocks, named sequences, normalization corrections, standardized variants, CJK + radicals and emoji sources. A document may represent the values actually assigned in a given version of + the UCD, or it may represent a draft version of the UCD, or a private agreement on Private Use + characters. The validity of an XML document with respect to the schema defined in this annex does not + assert anything about the correctness of the values. + + Valid documents may provide values for only some of the code points, or some of the Unicode + properties. Furthermore, they may also incorporate non-Unicode properties. + + Our schema is defined using English. However, a useful subset of the validity constraints can be + captured using a schema language, thereby simplifying the task of validating documents. We have chosen + Relax NG [ISO 19757], + in the compact syntax , as the schema language. It is important to stress that the schema which is + defined in English imposes more constraints on the documents than can be validated with the Relax NG + schema. + + An important characteristic of Relax NG is that its schemas do not modify or augment the infoset of + the documents. Therefore, it is possible to process our XML representation without using the schema. + Also, the schema is relatively straightforward and can be converted mechanically to other schema + languages. + + While our XML representation is not intended to be used during processing of characters and strings, + it is still a design principle for our schema to support the relatively efficient representation of the + UCD. This is achieved by an inheritance mechanism, similar to property inheritance in CSS or in XSL:FO + (see section 4.3 Group). + + Many invariants impose constraints on the values of the different properties for a given code point.
+ For example, if the value of the Numeric Type property is None, then the value of the + Numeric Value property should be the empty string; and if the value of the Other + Alphabetic property is true, then the value of the Alphabetic property should be + true. Those invariants are not captured in the schema. + +
+ + +
+ Namespace + The namespace for our elements is “http://www.unicode.org/ns/2003/ucd/1.0”. Our + attributes are in the empty namespace. + + + In all our examples, we assume that this namespace is the default one. + +
+ + +
+ Datatypes + We use the standard XML Schema datatypes: + + Characters are pervasive in the UCD, and will need to be represented. Representing characters directly + by themselves would seem the most obvious choice; for example, we could express that the decomposition + of U+00E8 is “&#x0065;&#x0300;”, that is, have exactly two characters in (the + infoset of) the XML document. However, the current XML specification limits the set of characters + that can be part of a document. Another problem is that the various tools (XML parser, XPATH engine, + etc.) may equate U+00E8 with U+0065 U+0300, thus making it difficult to figure out which of the two + sequences is contained in the database (which is sometimes important for our purposes). Therefore, we + chose instead to represent characters by their code points; we follow the usual convention of four to + six hexadecimal digits (uppercase) and code points in a sequence separated by space; for example, the + decomposition of U+00E8 will be represented by the nine characters “0065 0300” in the + infoset. + +
+ + +
+ Root Element + The root element of valid documents is a ucd. + + +
+ + +
+ Common attributes + A large number of properties are boolean. We uniformly use the values Y and + N for those: + + +
+ + +
+ Ordering of elements + In elements that hold lists of child elements, such as repertoire, + group, or standardized-variants, the schema does not require that the + child elements be in any particular order. + +
+
+ + +
+ Description + The root element may have a description child element, which in turn contains any string, + which is meant to describe what the XML document purports to describe. + + It is recommended that if the document purports to represent the UCD of some Unicode version, the + description be selected in accord with the rules listed in [Versions]; and + conversely, that documents which do not purport to represent the UCD be described as such. + + +
+ + +
+ Repertoire + The repertoire child element of the ucd element describes the code points and + their properties. As we will see shortly, code points can be described individually or as part of a group: + + + + +
+ Sets of code points + It is often the case that successive code points have the same property values, for a given set of + properties. The most striking example is that of an unallocated plane, where all but the last two + code points are reserved and have the same property values. Another example is the URO (U+4E00 + .. U+9FA5) where all the code points have the same property values if we ignore their name and their + Unihan properties. + + + This observation suggests that it is profitable to represent sets of code points which share the + same properties, rather than individual code points. To make the representation of the sets simple, + we restrict them to be segments in the code point space, that is a set is defined by the first and + last code point it contains. Those are captured by the attributes first-cp and + last-cp. The attribute cp is a shorthand notation for the case where the set + has a single code point. + + In the repertoire, there must be at most one code-point + element for a given code point. + +
+ + +
+ Code point types + When thinking about Unicode code points, it is useful to split them into four types: + + + those assigned to abstract characters (PUA or not) + the noncharacters + the surrogate code points + the reserved code points + + This leads to four elements to describe sets of code points: + + +
+ + +
+ Group + While we already recognized the situation where a set of code points have exactly the same set of + property values, another common situation is that of code points which have almost all the same + property values. + + For example, the characters U+1740 BUHID LETTER A .. U+1753 BUHID VOWEL SIGN U all have the age + “3.2”, and all have the script “Buhd”. On the one hand, it is convenient + to support data files in which those properties are explicitly listed with every code point, as this + makes answering questions like “what is the age of U+1749?” easier, because that data + is expressed right there. On the other hand, this leads to rather large data files, and it also tends + to obscure the differences between similar characters. + + + Our representation accounts for this situation with the notion of groups. A + group element is simply a container of code points that also holds default values for + the properties. If a code point inside a group does not list explicitly a property but the + group lists it, then the code point inherits that property from its + group. For example, the fragment with explicit properties: + + + <char cp="1740" age="3.2" na="BUHID LETTER A" gc="Lo" sc="Buhd"/> + <char cp="1741" age="3.2" na="BUHID LETTER I" gc="Lo" sc="Buhd"/> + <char cp="1752" age="3.2" na="BUHID VOWEL SIGN I" gc="Mn" sc="Buhd"/> + <char cp="1820" age="3.0" na="MONGOLIAN LETTER A" gc="Lo" sc="Mong"/> + is equivalent to this fragment which uses a group: + + + <group age="3.2" gc="Lo" sc="Buhd"> + <char cp="1740" na="BUHID LETTER A"/> + <char cp="1741" na="BUHID LETTER I"/> + <char cp="1752" na="BUHID VOWEL SIGN I" gc="Mn"/> + <char cp="1820" age="3.0" na="MONGOLIAN LETTER A" sc="Mong"/> + </group> + The element for U+1740 does not have the age attribute, and it therefore inherits it + from its enclosing group element, that is “3.2”. On the other hand, + the element for U+1820 does have this attribute, so the value is “3.0”.
+ + As this example illustrates, the notion of group does not necessarily align with the + notion of Unicode block. It is entirely defined and limited to our representation. In particular, the + value of a property for a code point can always be determined from the XML document alone, assuming + that this property and this code point are expressed at all. Of course, one may create an XML + representation where the groups happen to coincide with the Unicode blocks. + + Groups cannot be nested. The motivation for this limitation is to make the life of consumers + easier: either a property is defined by the element for a code point, or it is defined by the + immediately enclosing group element. + + +
+ + +
+ Properties + Each property, except for the Special_Case_Condition and Name_Alias + properties, is represented by an attribute. In an XML data file, the absence of an attribute (possibly + only on some code points) means that the document does not express the value + of the corresponding property. Conversely, the presence of an attribute is an expression of the + corresponding property value; the implied null value is represented by the empty string. + + The Name_Alias property is represented by zero or more name-alias child + elements. Unlike the situation for properties represented by attributes, it is not possible to determine + whether all the aliases have been represented in a data file by inspecting that data file. + + The name of an attribute is the abbreviated name of the property as given in the file + PropertyAliases.txt in the corresponding version of the UCD. For the Unihan + properties, the name is that given in the various versions of the Unihan database. + + For catalog and enumerated properties, the values are those listed in the file + PropertyValueAliases.txt in the corresponding version of the UCD; if there is an abbreviated + name, it is used, otherwise the long name is used. + + Note that the set of possible values for a property captured in this schema may change from one + version to the next. + + + +
+ Age property + The age attribute captures the version of Unicode in which a code point was + assigned to an abstract character, or made a surrogate or non-character. + + +
+ + +
+ Name properties + There are two name properties: the name given by the current version of the standard + (na), and possibly the name this character had in version 1.0 of the standard + (na1). + + + + The majority of the characters in Unicode have a name which is of the form CJK UNIFIED + IDEOGRAPH-<code point>. It also happens that character names cannot + contain the character U+0023 # NUMBER SIGN, so we adopted the following convention: if a + code point has the attribute na (either directly or by inheritance from an enclosing + group), then occurrences of the character # in the name are to be interpreted as the value of the + code point. For example: + + + <char cp="3400" na="CJK UNIFIED IDEOGRAPH-3400"/> + and + + <char cp="3400" na="CJK UNIFIED IDEOGRAPH-#"/> + are equivalent. The # can be in any position in the value of the na + attribute. The convention also applies just as well to a set of multiple code points: + + + <char cp="3400" na="CJK UNIFIED IDEOGRAPH-3400"/> + <char cp="3401" na="CJK UNIFIED IDEOGRAPH-3401"/> + is equivalent to + + <char cp="3400" na="CJK UNIFIED IDEOGRAPH-#"/> + <char cp="3401" na="CJK UNIFIED IDEOGRAPH-#"/> + which in turn is equivalent to: + + <char first-cp="3400" last-cp="3401" na="CJK UNIFIED IDEOGRAPH-#"/> +
+ + +
+ Name Alias properties + The Name_Alias property is represented by zero or more name-alias + child elements: + + +
+ + +
+ Block property + The Block property is represented by the blk attribute: + + +
+ + +
+ General Category + The general category is represented by the gc attribute. + + +
+ + +
+ Combining properties + The combining class is represented by the ccc attribute, which holds the decimal + representation of the combining class. + + Because the set of values that this property has taken across the various versions of the UCD + is rather large, our schema does not restrict the possible values to those actually used. + + +
+ + +
+ Bidirectionality properties + The bidirectional class is represented by the bc attribute. + + + The mirrored property is represented by the Bidi_M attribute, which takes a + boolean value. + + + The bmg attribute is the code point of a character whose glyph is typically + a mirrored image of the glyph for the current character. + + + Note that we do not express the “Best Fit” element recorded in BidiMirroring.txt. + For one thing, it is not meant to be machine readable. More importantly, the idea underlying the + mirrored glyph is delicate to use, since it makes assumptions about the design of the fonts, and + the best fit goes even farther. + + The Bidi_Control property is represented by the Bidi_C attribute. + + + The bidi paired bracket type and bidi paired bracket properties are represented by the + bpt and bpb attributes respectively. + + + +
+ + +
+ Decomposition properties + The decomposition type and decomposition mapping properties are represented by the dt + and dm attributes. + + Most characters have a decomposition mapping to themselves. This is very similar to the + situation we encountered with names, and we adopted a similar convention: if the value of a + decomposition mapping is the character itself, we use the attribute value # (U+0023 # + NUMBER SIGN) as a shorthand notation; this enables those attributes to be captured in groups. + + + The properties Composition_Exclusion and Full_Composition_Exclusion are + represented by the attributes CE and Comp_Ex: + + + The properties NFC_Quick_Check, NFD_Quick_Check, + NFKC_Quick_Check, NFKD_Quick_Check, Expands_On_NFC, + Expands_On_NFD, Expands_On_NFKC, Expands_On_NFKD, + FC_NFKC_Closure have corresponding attributes. + + +
+ + +
+ Numeric Properties + The numeric type is represented by the nt attribute. + + The numeric value is represented by the nv attribute, represented as a whole + number or a fraction. + + +
+ + +
+ Joining properties + The joining class of a character is represented by the jt attribute. + + The jg attribute is the joining group of the character. + + + The Join_Control property is represented by the Join_C attribute. + + +
+ + +
+ Linebreak properties + The Line_Break property is represented by the lb attribute. + + +
+ + +
+ East Asian Width property + The East Asian width property is represented by the ea attribute. + + +
+ + +
+ Case properties + The Uppercase, Lowercase, Other_Uppercase and + Other_Lowercase properties are represented by corresponding attributes. + + + Most characters have a case mapping and case folding properties that simply map or fold to + themselves. This is very similar to the situation we encountered with names, and we adopted a + similar convention: if the value of a case mapping or case folding property is the character + itself, we use the attribute value # (U+0023 # NUMBER SIGN) as a shorthand notation; this + enables those attributes to be captured in groups. + + The simple case mappings are recorded in the suc, slc, stc + attributes. + + + The non-simple case mappings are recorded in the uc, lc and tc + attributes. + + + The Simple_Case_Folding and Case_Folding properties are recorded in the + scf and cf attributes respectively. + + + The Case_Ignorable, Cased, Changes_When_Casefolded, + Changes_When_Casemapped, Changes_When_Lowercased, + Changes_When_NFKC_Casefolded, Changes_When_Titlecased, + Changes_When_Uppercased, NFKC_Casefold, and + NFKC_Simple_Casefold properties are recorded in these attributes: + + + Note that the UCD records more information about case folding than is expressed in the + properties, specifically the entries in CaseFolding.txt with status T. + +
+ + +
+ Script properties + The script and script extension properties are represented by the sc and + scx attributes respectively. + + +
+ + +
+ ISO Comment properties + The ISO 10646 comment field is represented by the isc attribute. + + +
+ + +
+ Hangul properties + The property Hangul_Syllable_Type is represented by the hst attribute. + + + The property Jamo_Short_Name is represented by the JSN attribute: + + +
+ + +
+ Indic properties + The property Indic_Syllabic_Category is represented by the InSC + attribute. + + + The property Indic_Positional_Category is represented by the InPC + attribute: + + + The property Indic_Conjunct_Break is represented by the InCB attribute: + + +
+ + +
+ Identifier and Pattern and programming language properties + + The properties ID_Start, Other_ID_Start, XID_Start, + ID_Continue, Other_ID_Continue, XID_Continue, + ID_Compat_Math_Start, and ID_Compat_Math_Continue are represented by + corresponding attributes: + + + The properties Pattern_Syntax and Pattern_White_Space are represented + by corresponding attributes: + + +
+ + +
+ Properties related to function and graphic characteristics + The properties Dash, Hyphen, Quotation_Mark, + Terminal_Punctuation, Sentence_Terminal, Diacritic, + Extender, Soft_Dotted, Alphabetic, + Other_Alphabetic, Math, Other_Math, Hex_Digit, + ASCII_Hex_Digit, Default_Ignorable_Code_Point, + Other_Default_Ignorable_Code_Point, Logical_Order_Exception, + Prepended_Concatenation_Mark, Modifier_Combining_Mark, + White_Space, Vertical_Orientation, and Regional_Indicator + describe the function or graphic characteristic of a character, and each have a corresponding + attribute. + + +
+ + +
+ Properties related to boundaries + The properties Grapheme_Base, Grapheme_Extend, + Other_Grapheme_Extend, Grapheme_Link, + Grapheme_Cluster_Break, Word_Break, and Sentence_Break each + have a corresponding attribute: + + +
+ + +
+ Properties related to ideographs + The properties Ideographic, Unified_Ideograph, + Equivalent_Unified_Ideograph, IDS_Binary_Operator, + IDS_Trinary_Operator, IDS_Unary_Operator, and Radical have + corresponding attributes: + + +
+ + +
+ Miscellaneous properties + The properties Deprecated, Variation_Selector, and + Noncharacter_Code_Point have corresponding attributes: + + +
+ + +
+ Unihan properties + The Unihan properties (from the Unihan database) are represented as attributes. + + +
+ + +
+ Tangut data + The Tangut data are represented as attributes. The attribute kRSTUnicode + represents the radical stroke index. The attribute kTGT_MergedSrc indicates the + source reference for the character. + + +
+ + +
+ Nushu data + The Nushu data are represented as attributes. The attribute kSrc_NushuDuben + indicates the page number and order of the item from the NushuDuben reference source. Nushu common + reading is represented as kReading. + +
+ + +
+ Emoji properties + The properties Emoji, EPres, EMod, EBase, + EComp, and ExtPict have corresponding attributes: + + +
+
+
+ + +
+ Blocks + The blocks child of the ucd describes the blocks. It has one child + block element per block, with attributes to describe the extent and name of the block. + + +
+ + +
+ Named Sequences + The named-sequences child of the ucd describes the named sequences. It has one + child named-sequence element per named sequence, with attributes to describe the name and + sequence. + + Similarly, the provisional-named-sequences child of the ucd describes the + provisional named sequences. + + +
+ + +
+ Normalization Corrections + The normalization-corrections child of the ucd describes the normalization + corrections. It has one child normalization-correction element per correction, with + attributes to describe the code point affected, its old normalization, its new normalization and the + version of Unicode in which the correction was made. + + +
+ + +
+ Standardized Variants + The standardized-variants child of the ucd describes the standardized + variants. It has one child element standardized-variant per variant. The attributes on that + last element capture the variation sequence, the description of the desired appearance, and the shaping + environment under which the appearance is different. + + +
+ + +
+ CJK Radicals + The cjk-radicals child of the ucd describes the CJK radicals. It has one + child element cjk-radical per radical. The attributes on that last element capture the + radical number, the corresponding CJK radical character, and the corresponding CJK unified ideograph. + + +
+ + +
+ Emoji sources + The emoji-sources child of the ucd describes the emoji sources. + + + + +
+ + +
+ Do Not Emit + The do-not-emit child of the ucd describes the + character sequences that should not be emitted or generated in newly authored texts. + + + +
+ + +
+ The full schema + Our schema is just the accumulation of the pieces we have described so far: + + + + + + + + + + + + + + + + + + + + + An expanded version is linked from the top of this document. +
+ + +
+ Examples + Here is a fragment of the UCD for a few representative + characters (only some of the properties are represented): + + + + + + + + + + + + + + + + + + + + + + + + + + + +]]> + +
+ + + + Acknowledgments + Thanks to Markus Scherer and Mark Davis for their help developing this XML representation. Thanks to + the reviewers: Julie Allen, Ernest van den Boogaard, Daniel Bünzli, John Cowan, Asmus Freytag, + Felix Sasaki, Andrew West. Special thanks to Eric Muller and Laurențiu Iancu. + + +
diff --git a/unicodetools/src/main/resources/org/unicode/uax42/index2html.xsl b/unicodetools/src/main/resources/org/unicode/uax42/index2html.xsl new file mode 100644 index 0000000000..f0a95fa958 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/index2html.xsl @@ -0,0 +1,611 @@ + + + + + + + + + + + + + + + + + + + + + <xsl:choose> + <xsl:when test="articleinfo/unicode:tr/@class='uax'"> + <xsl:text>UAX</xsl:text> + </xsl:when> + <xsl:when test="articleinfo/unicode:tr/@class='uts'"> + <xsl:text>UTS</xsl:text> + </xsl:when> + <xsl:when test="articleinfo/unicode:tr/@class='utr'"> + <xsl:text>UTR</xsl:text> + </xsl:when> + </xsl:choose> + <xsl:text> #</xsl:text> + <xsl:value-of select="articleinfo/unicode:tr/@number"/> + <xsl:text>: </xsl:text> + <xsl:value-of select="title"/> + + + + + + + + + + + + +
+ + [Unicode] +  Technical Reports +
 
+
+

+ + + + + Unicode® Standard Annex + + + Unicode® Technical Standard + + + Unicode® Technical Report + + + # + +

+

+ + +
+ + +
+ + + + + + +
+ +

Modifications

+

This section indicates the changes introduced by each revision.

+ +
+ +
+ + + + + Working draft + + + Proposed Update + + + + + + + + + + + + + + + + https://www.unicode.org/reports/tr + + /tr + + - + + .html + + + + + + + + https://www.unicode.org/reports/tr + + /tr + + - + + .html + + + + https://www.unicode.org/reports/tr + + / + + + + https://www.unicode.org/reports/tr + + /tr + + - + + .rnc + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Version + Unicode + + + + +
+ + + Editor + + + Editors + + + + +
Date + + + + +
This Version + + + + +
Previous Version + + + n/a + + + + + + + + +
Latest Version + +
Latest Proposed Update + proposed.html +
Schema + + + + +
Revision + + + + + + +
+
+ + + + + + + + + +
+
+ + + ( + mailto: + ) + + + + + +

Summary

+ +
+ + +

+
+ + + + +

Status

+ + +

This document has been reviewed by Unicode members and other interested parties, and has been + approved for publication by the Unicode Consortium. This is a stable document and may be used as reference + material or cited as a normative reference by other specifications.

+
+ +

+ + This is a draft document which may be updated, replaced, or + superseded by other documents at any time. Publication does not imply endorsement by the Unicode + Consortium. This is not a stable document; it is inappropriate to cite this document as other than a + work in progress.

+
+
+ + +
+

A Unicode Standard Annex (UAX) forms an integral part of the Unicode Standard, but is + published online as a separate document. The Unicode Standard may require conformance to normative + content in a Unicode Standard Annex, if so specified in the Conformance chapter of that version of the + Unicode Standard. The version number of a UAX document corresponds to the version of the Unicode Standard + of which it forms a part.

+
+

Please submit corrigenda and other comments with the online reporting form [Feedback]. Related information that is useful in + understanding this annex is found in Unicode Standard Annex #41, “Common References for Unicode Standard + Annexes.” For the latest version of the Unicode Standard, see [Unicode]. For a list of current Unicode + Technical Reports, see [Reports]. For more information about + versions of the Unicode Standard, see [Versions]. For any + errata which may apply to this annex, see [Errata].

+
+ +
+

A Unicode Technical Standard (UTS) is an independent specification. Conformance to the Unicode + Standard does not imply conformance to any UTS.

+
+

Please submit corrigenda and other comments with the online reporting form [ + Feedback]. Related information that is useful in understanding this document is found in References. For the latest version of the Unicode Standard see [Unicode]. For a list of current Unicode Technical Reports see [Reports]. For more information about versions of the Unicode Standard, see + [Versions].

+
+ +
+

A Unicode Technical Report (UTR) contains informative material. Conformance to the Unicode + Standard does not imply conformance to any UTR. Other specifications, however, are free to make normative + references to a UTR.

+
+

Please submit corrigenda and other comments with the online reporting form [ + Feedback]. Related information that is useful in understanding this document is found in References. For the latest version of the Unicode Standard see [Unicode]. For a list of current Unicode Technical Reports see [Reports]. For more information about versions of the Unicode Standard, see + [Versions].

+
+
+
+ + + + +

Contents

+ +
+ + +
  • + + +
      + +
    +
    +
  • +
    + + + + + + +      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    + + + +

    +
    + + +

    +
    + + +
    +
    + + + + + + + + + _blank + + + + + + + + + + + + + + + + + + + [ + + + + + + + + + + + + : + + + , ] + + + + +

    + [, + ] + + = + + + +

    +
    + + +

    + [] + + = + +

    +
    + + + + + + + + +
    +

    + Revision +

    + +
    +
    + + +
    +

    + +

    +
    +
    + + +
      + +
    +
    + + +
  • + +
  • +
    + + + + + + + + + + + + + + + + background-color: #ffff00; border-style:dotted; border-width:1px + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    diff --git a/unicodetools/src/main/resources/org/unicode/uax42/index2rnc.xsl b/unicodetools/src/main/resources/org/unicode/uax42/index2rnc.xsl new file mode 100644 index 0000000000..b7a8dfa819 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/index2rnc.xsl @@ -0,0 +1,45 @@ + + + + + + + + + + + + # Copyright © Unicode, Inc. + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/unicodetools/src/main/resources/org/unicode/uax42/output/index.html b/unicodetools/src/main/resources/org/unicode/uax42/output/index.html new file mode 100644 index 0000000000..13bf8181d1 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/output/index.html @@ -0,0 +1,3482 @@ + + + + + + + UAX #42: Unicode Character Database in XML + + + + + + + + + + + +
    + + [Unicode] +  Technical Reports +
     
    +
    +

    + Proposed Update Unicode® Standard Annex #42

    +

    Unicode Character Database in XML

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    VersionUnicode 16.0.0 +
    + Editor + + John Wilcock
    +
    Date + 2024-08-15 +
    This Version + + https://www.unicode.org/reports/tr42/tr42-36.html + +
    Previous Version + + https://www.unicode.org/reports/tr42/tr42-34.html + +
    Latest Version + https://www.unicode.org/reports/tr42/ +
    Latest Proposed Update + https://www.unicode.org/reports/tr42/proposed.html +
    Schema + + https://www.unicode.org/reports/tr42/tr42-36.rnc + +
    Revision + + 36 + +
    +

    Summary

    +

    + This annex describes an XML representation of the Unicode Character Database. +

    +

    + Status +

    +

    + This is a + draft + document which may be updated, replaced, or + superseded by other documents at any time. Publication does not imply endorsement by the Unicode + Consortium. This is not a stable document; it is inappropriate to cite this document as other than a + work in progress. +

    +
    +

    + + A Unicode Standard Annex (UAX) forms an integral part of the Unicode Standard, but is + published online as a separate document. The Unicode Standard may require conformance to normative + content in a Unicode Standard Annex, if so specified in the Conformance chapter of that version of the + Unicode Standard. The version number of a UAX document corresponds to the version of the Unicode Standard + of which it forms a part. +

    +
    +

    + Please submit corrigenda and other comments with the online reporting form [Feedback]. Related information that is useful in + understanding this annex is found in Unicode Standard Annex #41, “Common References for Unicode Standard + Annexes.” For the latest version of the Unicode Standard, see [Unicode]. For a list of current Unicode + Technical Reports, see [Reports]. For more information about + versions of the Unicode Standard, see [Versions]. For any + errata which may apply to this annex, see [Errata]. +

    +

    Contents

    + +
    +

    + 1 Introduction +

    +

    In working on Unicode implementations, it is often useful to access the full content of the Unicode + Character Database (UCD). For example, in establishing mappings from characters to glyphs in fonts, it is + convenient to see the character scalar value, the character name, the character East Asian width, along with + the shape and metrics of the proposed glyph to map to; looking at all this data simultaneously helps in + evaluating the mapping. +

    +

    Directly accessing the data files that constitute the UCD is sometimes a daunting proposition. The data is + dispersed in a number of files of various formats, and there are just enough peculiarities (all justified by + the processing power available at the time the UCD representation was designed) to require a fairly intimate + knowledge of the data format itself, in addition to the meaning of the data. +

    +

    Many programming environments (for example, Java or ICU) do give access to the UCD. However, those + environments tend to lag behind releases of the standard, or support only some of the UCD content. +

    +

    Unibook is a wonderful tool to explore the UCD and in many cases is just the ticket; however, it is + difficult to use when the task at hand has not been built-in, or when non-UCD data is to be displayed as + well. +

    +

    This annex presents an alternative representation of the UCD, which is meant to overcome these + difficulties. We have chosen an XML representation, because parsing becomes a non-issue: there are a number + of XML parsers freely available, and using them is often fairly easy. In addition, there are freely + available tools that can perform powerful operations on XML data; for example, XPATH and XQUERY engines can + be thought of as a “grep” for XML data and XSLT engines can be thought of as + “awk” for XML data. +

    +

    It is important to note that we are interested in exploring the content of the UCD, rather than in using + the UCD data to process character streams. Thus, we are not concerned so much by the speed of processing or + the size of our representation. +

    +

    Our representation supports the creation of documents that represent only parts of the UCD, either by not + representing all the characters, or by not representing all the properties. This can be useful when only + some of the data is needed. +

    +

    This annex presents only the XML representation format of the UCD. The data itself is part of the Unicode + Character Database. +

    +

    + 2 Overall schema +

    +

    + 2.1 General principles +

    +

    Our schema can be used to create and validate documents which are intended to represent properties of + Unicode code points, blocks, named sequences, normalization corrections, standardized variants, CJK + radicals and emoji sources. A document may represent the values actually assigned in a given version of + the UCD, or it may represent a draft version of the UCD, or a private agreement on Private Use + characters. The validity of an XML document with respect to the schema defined in this annex does not + assert anything about the correctness of the values. +

    +

    Valid documents may provide values for only some of the code points, or some of the Unicode + properties. Furthermore, they may also incorporate non-Unicode properties. +

    +

    Our schema is defined using English. However, a useful subset of the validity constraints can be + captured using a schema language, thereby simplifying the task of validating documents. We have chosen + Relax NG [ISO 19757], + in the compact syntax , as the schema language. It is important to stress that the schema which is + defined in English imposes more constraints on the documents than can be validated with the Relax NG + schema. +

    +

    An important characteristic of Relax NG is that its schemas do not modify or augment the infoset of + the documents. Therefore, it is possible to process our XML representation without using the schema. + Also, the schema is relatively straightforward and can be converted mechanically to other schema + languages. +

    +

    While our XML representation is not intended to be used during processing of characters and strings, + it is still a design principle for our schema to support the relatively efficient representation of the + UCD. This is achieved by an inheritance mechanism, similar to property inheritance in CSS or in XSL:FO + (see section 4.3 Group). +

    +

    Many invariants impose constraints on the values of the different properties for a given code point. + For example, if the value of the Numeric Type property is None, then the value of the + Numeric Value property should be the empty string; and if the value of the Other + Alphabetic property is true, then the value of the Alphabetic property should be + true. Those invariants are not captured in the schema. +

    +

    + 2.2 Namespace +

    +

    The namespace for our elements is “http://www.unicode.org/ns/2003/ucd/1.0”. Our + attributes are in the empty namespace. +

    +

    + + [namespace declaration, + 1] + + = + + default namespace ucd = "http://www.unicode.org/ns/2003/ucd/1.0" + +

    +

    In all our examples, we assume that this namespace is the default one. +

    +

    + 2.3 Datatypes +

    +

    We use the standard XML Schema datatypes:

    +

    + + [datatypes declaration, + 2] + + = + + # default; datatypes xsd = "http://www.w3.org/2001/XMLSchema-datatypes" + +

    +

    Characters are pervasive in the UCD, and will need to be represented. Representing characters directly + by themselves would seem the most obvious choice; for example, we could express that the decomposition + of U+00E8 is “&#x0065;&#x0300;”, that is, have exactly two characters in (the + infoset of) the XML document. However, the current XML specification limits the set of characters + that can be part of a document. Another problem is that the various tools (XML parser, XPATH engine, + etc.) may equate U+00E8 with U+0065 U+0300, thus making it difficult to figure out which of the two + sequences is contained in the database (which is sometimes important for our purposes). Therefore, we + chose instead to represent characters by their code points; we follow the usual convention of four to + six hexadecimal digits (uppercase) and code points in a sequence separated by space; for example, the + decomposition of U+00E8 will be represented by the nine characters “0065 0300” in the + infoset. +

    +

    + + [datatype for code points, + 3] + + = + + single-code-point = xsd:string { pattern = "(|[1-9A-F]|(10))[0-9A-F]{4}" } + + one-or-more-code-points = list { single-code-point + } + zero-or-more-code-points = list { single-code-point * } + two-code-points = list { single-code-point, single-code-point } + +

    +

    + 2.4 Root Element +

    +

    The root element of valid documents is a ucd. +

    +

    + + [schema start, + 4] + + = + + start = + element ucd { ucd.content } + +

    +

    + 2.5 Common attributes +

    +

    A large number of properties are boolean. We uniformly use the values Y and + N for those: +

    +

    + + [boolean, + 5] + + = + + boolean = "Y" | "N" + +

    +

    + 2.6 Ordering of elements +

    +

    In elements that hold lists of child elements, such as repertoire, + group, or standardized-variants, the schema does not require that the + child elements be in any particular order. +

    +

    + 3 Description +

    +

    The root element may have a description child element, which in turn contains any string, + which is meant to describe what the XML document purports to describe. +

    +

    It is recommended that if the document purports to represent the UCD of some Unicode version, the + description be selected in accord with the rules listed in [Versions]; and + conversely, that documents which do not purport to represent the UCD be described as such. +

    +

    + + [description, + 6] + + = + + ucd.content &= + element description { text }? + +

    +

    + 4 Repertoire +

    +

    The repertoire child element of the ucd element describes the code points and + their properties. As we will see shortly, code points can be described individually or as part of a group: +

    +

    + + [repertoire, + 7] + + = + + ucd.content &= + element repertoire { (code-point | group) + }? + +

    +

    + 4.1 Sets of code points +

    +

    It is often the case that successive code points have the same property values, for a given set of + properties. The most striking example is that of an unallocated plane, where all but the last two + code points are reserved and have the same property values. Another example is the URO (U+4E00 + .. U+9FA5) where all the code points have the same property values if we ignore their name and their + Unihan properties. +

    +

    + + [Set of code points, + 8] + + = + + set-of-code-points = + attribute cp { single-code-point } + | ( attribute first-cp { single-code-point }, + attribute last-cp { single-code-point } ) + +

    +

    This observation suggests that it is profitable to represent sets of code points which share the + same properties, rather than individual code points. To make the representation of the sets simple, + we restrict them to be segments in the code point space, that is, a set is defined by the first and + last code point it contains. Those are captured by the attributes first-cp and + last-cp. The attribute cp is a shorthand notation for the case where the set + has a single code point. +

    +

    In the repertoire, there must be at most one code-point + element for a given code point. +

    +

    + 4.2 Code point types +

    +

    When thinking about Unicode code points, it is useful to split them into four types: +

    + those assigned to abstract characters (PUA or not) + the noncharacters + the surrogate code points + the reserved code points +

    This leads to four elements to describe sets of code points: +

    +

    + + [Code points, + 9] + + = + + code-point |= + element reserved { + set-of-code-points, + code-point-attributes } + + code-point |= + element noncharacter { + set-of-code-points, + code-point-attributes } + + code-point |= + element surrogate { + set-of-code-points, + code-point-attributes } + + code-point |= + element char { + set-of-code-points, + code-point-attributes } + +

    +

    + 4.3 Group +

    +

    While we already recognized the situation where a set of code points have exactly the same set of + property values, another common situation is that of code points which have almost all the same + property values. +

    +

    For example, the characters U+1740 BUHID LETTER A .. U+1753 BUHID VOWEL SIGN U all have the age + “3.2”, and all have the script “Buhd”. On the one hand, it is convenient + to support data files in which those properties are explicitly listed with every code point, as this + makes answering questions like “what is the age of U+1749?” easier, because that data + is expressed right there. On the other hand, this leads to rather large data files, and it also tends + to obscure the differences between similar characters. +

    +

    Our representation accounts for this situation with the notion of groups. A + group element is simply a container of code points that also holds default values for + the properties. If a code point inside a group does not list explicitly a property but the + group lists it, then the code point inherits that property from its + group. For example, the fragment with explicit properties: +

    +
    +    <char cp="1740" age="3.2" na="BUHID LETTER A" gc="Lo" sc="Buhd"/>
    +    <char cp="1741" age="3.2" na="BUHID LETTER I" gc="Lo" sc="Buhd"/>
    +    <char cp="1752" age="3.2" na="BUHID VOWEL SIGN I" gc="Mn" sc="Buhd"/>
    +    <char cp="1820" age="3.0" na="MONGOLIAN LETTER A" gc="Lo" sc="Mong"/>
    +

    is equivalent to this fragment which uses a group: +

    +
    +    <group age="3.2" gc="Lo" sc="Buhd">
    +        <char cp="1740" na="BUHID LETTER A"/>
    +        <char cp="1741" na="BUHID LETTER I"/>
    +        <char cp="1752" na="BUHID VOWEL SIGN I" gc="Mn"/>
    +        <char cp="1820" age="3.0" na="MONGOLIAN LETTER A" sc="Mong"/>
    +    </group>
    +

    The element for U+1740 does not have the age attribute, and it therefore inherits it + from its enclosing group element, that is “3.2”. On the other hand, + the element for U+1820 does have this attribute, so the value is “3.0”. +

    +

    As this example illustrates, the notion of group does not necessarily align with the + notion of Unicode block. It is entirely defined and limited to our representation. In particular, the + value of a property for a code point can always be determined from the XML document alone, assuming + that this property and this code point are expressed at all. Of course, one may create an XML + representation where the groups happen to coincide with the Unicode blocks. +

    +

    Groups cannot be nested. The motivation for this limitation is to make the life of consumers + easier: either a property is defined by the element for a code point, or it is defined by the + immediately enclosing group element. +

    +

    + + [groups, + 10] + + = + + group = + element group { + code-point-attributes, + code-point* } + +

    +

    + 4.4 Properties +

    +

    Each property, except for the Special_Case_Condition and Name_Alias + properties, is represented by an attribute. In an XML data file, the absence of an attribute (possibly + only on some code points) means that the document does not express the value + of the corresponding property. Conversely, the presence of an attribute is an expression of the + corresponding property value; the implied null value is represented by the empty string. +

    +

    The Name_Alias property is represented by zero or more name-alias child + elements. Unlike the situation for properties represented by attributes, it is not possible to determine + whether all the aliases have been represented in a data file by inspecting that data file. +

    +

    The name of an attribute is the abbreviated name of the property as given in the file + PropertyAliases.txt in the corresponding version of the UCD. For the Unihan + properties, the name is that given in the various versions of the Unihan database. +

    +

    For catalog and enumerated properties, the values are those listed in the file + PropertyValueAliases.txt in the corresponding version of the UCD; if there is an abbreviated + name, it is used, otherwise the long name is used. +

    +

    Note that the set of possible values for a property captured in this schema may change from one + version to the next. +

    +

    + 4.4.1 Age property +

    +

    The age attribute captures the version of Unicode in which a code point was + assigned to an abstract character, or made a surrogate or non-character. +

    +

    + + [age attribute, + 11] + + = + + code-point-attributes &= + attribute age { "1.1" + | "2.0" | "2.1" + | "3.0" | "3.1" | "3.2" + | "4.0" | "4.1" + | "5.0" | "5.1" | "5.2" + | "6.0" | "6.1" | "6.2" | "6.3" + | "7.0" + | "8.0" + | "9.0" + | "10.0" + | "11.0" + | "12.0" | "12.1" + | "13.0" + | "14.0" + | "15.0" | "15.1" + | "16.0" + | "17.0" + | "unassigned" + }? + +

    +

    + 4.4.2 Name properties +

    +

    There are two name properties: the name given by the current version of the standard + (na), and possibly the name this character had in version 1.0 of the standard + (na1). +

    +

    + + [na attribute, + 12] + + = + + code-point-attributes &= + attribute na { "" | + "CJK UNIFIED IDEOGRAPH-#" | + "CJK COMPATIBILITY IDEOGRAPH-#" | + "EGYPTIAN HIEROGLYPH-#" | + "TANGUT IDEOGRAPH-#" | + "KHITAN SMALL SCRIPT CHARACTER-#" | + "NUSHU CHARACTER-#" | + xsd:string { pattern="[a-zA-Z0-9]+(( -|- |[\-_ ])[a-zA-Z0-9]+)*" } + }? + +

    +

    + + [na1 attribute, + 13] + + = + + code-point-attributes &= + attribute na1 { "" | xsd:string { pattern="[a-zA-Z0-9]+([\-_ ][a-zA-Z0-9]+)*( \(.*\))?" } }? + +

    +

    The majority of the characters in Unicode have a name which is of the form CJK UNIFIED + IDEOGRAPH-<code point>. It also happens that character names cannot + contain the character U+0023 # NUMBER SIGN, so we adopted the following convention: if a + code point has the attribute na (either directly or by inheritance from an enclosing + group), then occurrences of the character # in the name are to be interpreted as the value of the + code point. For example: +

    +
    +    <char cp="3400" na="CJK UNIFIED IDEOGRAPH-3400"/>
    +

    and

    +
    +    <char cp="3400" na="CJK UNIFIED IDEOGRAPH-#"/>
    +

    are equivalent. The # can be in any position in the value of the na + attribute. The convention also applies just as well to a set of multiple code points: +

    +
    +    <char cp="3400" na="CJK UNIFIED IDEOGRAPH-3400"/>
    +    <char cp="3401" na="CJK UNIFIED IDEOGRAPH-3401"/>
    +

    is equivalent to

    +
    +    <char cp="3400" na="CJK UNIFIED IDEOGRAPH-#"/>
    +    <char cp="3401" na="CJK UNIFIED IDEOGRAPH-#"/>
    +

    which in turn is equivalent to:

    +
    +    <char first-cp="3400" last-cp="3401" na="CJK UNIFIED IDEOGRAPH-#"/>
    +

    + 4.4.3 Name Alias properties +

    +

    The Name_Alias property is represented by zero or more name-alias + child elements: +

    +

    + + [name-alias element, + 14] + + = + + code-point-attributes &= + element name-alias { + attribute alias { xsd:string { pattern="[a-zA-Z0-9]+(( -|- |[\-_ ])[a-zA-Z0-9]+)*" } }?, + attribute type { "abbreviation" | "alternate" + | "control" | "correction" + | "figment" + }? } * + +

    +

    + 4.4.4 Block property +

    +

    The Block property is represented by the blk attribute: +

    +

    + + [blk attribute, + 15] + + = + + code-point-attributes &= + attribute blk { "Adlam" + | "Aegean_Numbers" + | "Ahom" + | "Alchemical" + | "Alphabetic_PF" + | "Anatolian_Hieroglyphs" + | "Ancient_Greek_Music" + | "Ancient_Greek_Numbers" + | "Ancient_Symbols" + | "Arabic" + | "Arabic_Ext_A" + | "Arabic_Ext_B" + | "Arabic_Ext_C" + | "Arabic_Math" + | "Arabic_PF_A" + | "Arabic_PF_B" + | "Arabic_Sup" + | "Armenian" + | "Arrows" + | "ASCII" + | "Avestan" + | "Balinese" + | "Bamum" + | "Bamum_Sup" + | "Bassa_Vah" + | "Batak" + | "Bengali" + | "Bhaiksuki" + | "Block_Elements" + | "Bopomofo" + | "Bopomofo_Ext" + | "Box_Drawing" + | "Brahmi" + | "Braille" + | "Buginese" + | "Buhid" + | "Byzantine_Music" + | "Carian" + | "Caucasian_Albanian" + | "Chakma" + | "Cham" + | "Cherokee" + | "Cherokee_Sup" + | "Chess_Symbols" + | "Chorasmian" + | "CJK" + | "CJK_Compat" + | "CJK_Compat_Forms" + | "CJK_Compat_Ideographs" + | "CJK_Compat_Ideographs_Sup" + | "CJK_Ext_A" + | "CJK_Ext_B" + | "CJK_Ext_C" + | "CJK_Ext_D" + | "CJK_Ext_E" + | "CJK_Ext_F" + | "CJK_Ext_G" + | "CJK_Ext_H" + | "CJK_Ext_I" + | "CJK_Radicals_Sup" + | "CJK_Strokes" + | "CJK_Symbols" + | "Compat_Jamo" + | "Control_Pictures" + | "Coptic" + | "Coptic_Epact_Numbers" + | "Counting_Rod" + | "Cuneiform" + | "Cuneiform_Numbers" + | "Currency_Symbols" + | "Cypriot_Syllabary" + | "Cypro_Minoan" + | "Cyrillic" + | "Cyrillic_Ext_A" + | "Cyrillic_Ext_B" + | "Cyrillic_Ext_C" + | "Cyrillic_Ext_D" + | "Cyrillic_Sup" + | "Deseret" + | "Devanagari" + | "Devanagari_Ext" + | "Devanagari_Ext_A" + | "Diacriticals" + | "Diacriticals_Ext" + | "Diacriticals_For_Symbols" + | "Diacriticals_Sup" + | "Dingbats" + | "Dives_Akuru" + | "Dogra" + | "Domino" + | "Duployan" + | "Early_Dynastic_Cuneiform" + | "Egyptian_Hieroglyph_Format_Controls" + | "Egyptian_Hieroglyphs" + | "Egyptian_Hieroglyphs_Ext_A" + | "Elbasan" + | "Elymaic" + | "Emoticons" + | "Enclosed_Alphanum" + | "Enclosed_Alphanum_Sup" + | "Enclosed_CJK" + | 
"Enclosed_Ideographic_Sup" + | "Ethiopic" + | "Ethiopic_Ext" + | "Ethiopic_Ext_A" + | "Ethiopic_Ext_B" + | "Ethiopic_Sup" + | "Garay" + | "Geometric_Shapes" + | "Geometric_Shapes_Ext" + | "Georgian" + | "Georgian_Ext" + | "Georgian_Sup" + | "Glagolitic" + | "Glagolitic_Sup" + | "Gothic" + | "Grantha" + | "Greek" + | "Greek_Ext" + | "Gujarati" + | "Gunjala_Gondi" + | "Gurmukhi" + | "Gurung_Khema" + | "Half_And_Full_Forms" + | "Half_Marks" + | "Hangul" + | "Hanifi_Rohingya" + | "Hanunoo" + | "Hatran" + | "Hebrew" + | "High_PU_Surrogates" + | "High_Surrogates" + | "Hiragana" + | "IDC" + | "Ideographic_Symbols" + | "Imperial_Aramaic" + | "Indic_Number_Forms" + | "Indic_Siyaq_Numbers" + | "Inscriptional_Pahlavi" + | "Inscriptional_Parthian" + | "IPA_Ext" + | "Jamo" + | "Jamo_Ext_A" + | "Jamo_Ext_B" + | "Javanese" + | "Kaithi" + | "Kaktovik_Numerals" + | "Kana_Ext_A" + | "Kana_Ext_B" + | "Kana_Sup" + | "Kanbun" + | "Kangxi" + | "Kannada" + | "Katakana" + | "Katakana_Ext" + | "Kawi" + | "Kayah_Li" + | "Kharoshthi" + | "Khitan_Small_Script" + | "Khmer" + | "Khmer_Symbols" + | "Khojki" + | "Khudawadi" + | "Kirat_Rai" + | "Lao" + | "Latin_1_Sup" + | "Latin_Ext_A" + | "Latin_Ext_Additional" + | "Latin_Ext_B" + | "Latin_Ext_C" + | "Latin_Ext_D" + | "Latin_Ext_E" + | "Latin_Ext_F" + | "Latin_Ext_G" + | "Lepcha" + | "Letterlike_Symbols" + | "Limbu" + | "Linear_A" + | "Linear_B_Ideograms" + | "Linear_B_Syllabary" + | "Lisu" + | "Lisu_Sup" + | "Low_Surrogates" + | "Lycian" + | "Lydian" + | "Mahajani" + | "Mahjong" + | "Makasar" + | "Malayalam" + | "Mandaic" + | "Manichaean" + | "Marchen" + | "Masaram_Gondi" + | "Math_Alphanum" + | "Math_Operators" + | "Mayan_Numerals" + | "Medefaidrin" + | "Meetei_Mayek" + | "Meetei_Mayek_Ext" + | "Mende_Kikakui" + | "Meroitic_Cursive" + | "Meroitic_Hieroglyphs" + | "Miao" + | "Misc_Arrows" + | "Misc_Math_Symbols_A" + | "Misc_Math_Symbols_B" + | "Misc_Pictographs" + | "Misc_Symbols" + | "Misc_Technical" + | "Modi" + | "Modifier_Letters" + | 
"Modifier_Tone_Letters" + | "Mongolian" + | "Mongolian_Sup" + | "Mro" + | "Multani" + | "Music" + | "Myanmar" + | "Myanmar_Ext_A" + | "Myanmar_Ext_B" + | "Myanmar_Ext_C" + | "Nabataean" + | "Nag_Mundari" + | "Nandinagari" + | "NB" + | "New_Tai_Lue" + | "Newa" + | "NKo" + | "Number_Forms" + | "Nushu" + | "Nyiakeng_Puachue_Hmong" + | "OCR" + | "Ogham" + | "Ol_Chiki" + | "Ol_Onal" + | "Old_Hungarian" + | "Old_Italic" + | "Old_North_Arabian" + | "Old_Permic" + | "Old_Persian" + | "Old_Sogdian" + | "Old_South_Arabian" + | "Old_Turkic" + | "Old_Uyghur" + | "Oriya" + | "Ornamental_Dingbats" + | "Osage" + | "Osmanya" + | "Ottoman_Siyaq_Numbers" + | "Pahawh_Hmong" + | "Palmyrene" + | "Pau_Cin_Hau" + | "Phags_Pa" + | "Phaistos" + | "Phoenician" + | "Phonetic_Ext" + | "Phonetic_Ext_Sup" + | "Playing_Cards" + | "Psalter_Pahlavi" + | "PUA" + | "Punctuation" + | "Rejang" + | "Rumi" + | "Runic" + | "Samaritan" + | "Saurashtra" + | "Sharada" + | "Shavian" + | "Shorthand_Format_Controls" + | "Siddham" + | "Sinhala" + | "Sinhala_Archaic_Numbers" + | "Small_Forms" + | "Small_Kana_Ext" + | "Sogdian" + | "Sora_Sompeng" + | "Soyombo" + | "Specials" + | "Sundanese" + | "Sundanese_Sup" + | "Sunuwar" + | "Sup_Arrows_A" + | "Sup_Arrows_B" + | "Sup_Arrows_C" + | "Sup_Math_Operators" + | "Sup_PUA_A" + | "Sup_PUA_B" + | "Sup_Punctuation" + | "Sup_Symbols_And_Pictographs" + | "Super_And_Sub" + | "Sutton_SignWriting" + | "Syloti_Nagri" + | "Symbols_And_Pictographs_Ext_A" + | "Symbols_For_Legacy_Computing" + | "Symbols_For_Legacy_Computing_Sup" + | "Syriac" + | "Syriac_Sup" + | "Tagalog" + | "Tagbanwa" + | "Tags" + | "Tai_Le" + | "Tai_Tham" + | "Tai_Viet" + | "Tai_Xuan_Jing" + | "Takri" + | "Tamil" + | "Tamil_Sup" + | "Tangsa" + | "Tangut" + | "Tangut_Components" + | "Tangut_Sup" + | "Telugu" + | "Thaana" + | "Thai" + | "Tibetan" + | "Tifinagh" + | "Tirhuta" + | "Todhri" + | "Toto" + | "Transport_And_Map" + | "Tulu_Tigalari" + | "UCAS" + | "UCAS_Ext" + | "UCAS_Ext_A" + | "Ugaritic" + | "Vai" + | 
"Vedic_Ext" + | "Vertical_Forms" + | "Vithkuqi" + | "VS" + | "VS_Sup" + | "Wancho" + | "Warang_Citi" + | "Yezidi" + | "Yi_Radicals" + | "Yi_Syllables" + | "Yijing" + | "Zanabazar_Square" + | "Znamenny_Music" + }? + +

    +

    + 4.4.5 General Category +

    +

    The general category is represented by the gc attribute. +

    +

    + + [gc attribute, + 16] + + = + + code-point-attributes &= + attribute gc { "Cc" | "Cf" | "Cn" | "Co" | "Cs" + | "Ll" | "Lm" | "Lo" | "Lt" | "Lu" + | "Mc" | "Me" | "Mn" + | "Nd" | "Nl" | "No" + | "Pc" | "Pd" | "Pe" | "Pf" | "Pi" | "Po" | "Ps" + | "Sc" | "Sk" | "Sm" | "So" + | "Zl" | "Zp" | "Zs" + }? + +

    +

    + 4.4.6 Combining properties +

    +

    The combining class is represented by the ccc attribute, which holds the decimal + representation of the combining class. +

    +

    Because the set of values that this property has taken across the various versions of the UCD + is rather large, our schema does not restrict the possible values to those actually used. +

    +

    + + [ccc attribute, + 17] + + = + + code-point-attributes &= + attribute ccc { xsd:integer { minInclusive="0" maxInclusive="254" } }? + +

    +

    + 4.4.7 Bidirectionality properties +

    +

    The bidirectional class is represented by the bc attribute. +

    +

    + + [bc attribute, + 18] + + = + + code-point-attributes &= + attribute bc { "AL" | "AN" + | "B" | "BN" + | "CS" + | "EN" | "ES" | "ET" + | "FSI" + | "L" | "LRE" | "LRI" | "LRO" + | "NSM" + | "ON" + | "PDF" | "PDI" + | "R" | "RLE" | "RLI" | "RLO" + | "S" + | "WS" + }? + +

    +

    The mirrored property is represented by the Bidi_M attribute, which takes a + boolean value. +

    +

    + + [Bidi_M attribute, + 19] + + = + + code-point-attributes &= + attribute Bidi_M { boolean }? + +

    +

    The bmg attribute is the code point of a character whose glyph is typically + a mirrored image of the glyph for the current character. +

    +

    + + [bmg attribute, + 20] + + = + + code-point-attributes &= + attribute bmg { "" | single-code-point }? + +

    +

    Note that we do not express the “Best Fit” element recorded in BidiMirroring.txt. + For one thing, it is not meant to be machine readable. More importantly, the idea underlying the + mirrored glyph is delicate to use, since it makes assumptions about the design of the fonts, and + the best fit goes even farther. +

    +

    The Bidi_Control property is represented by the Bidi_C attribute. +

    +

    + + [Bidi_C attribute, + 21] + + = + + code-point-attributes &= + attribute Bidi_C { boolean }? + +

    +

    The bidi paired bracket type and bidi paired bracket properties are represented by the + bpt and bpb attributes respectively. +

    +

    + + [bpt attribute, + 22] + + = + + code-point-attributes &= + attribute bpt { "o" | "c" | "n" }? + +

    +

    + + [bpb attribute, + 23] + + = + + code-point-attributes &= + attribute bpb { "#" | single-code-point }? + +

    +

    + 4.4.8 Decomposition properties +

    +

    The decomposition type and decomposition mapping properties are represented by the dt + and dm attributes. +

    +

    Most characters have a decomposition mapping to themselves. This is very similar to the + situation we encountered with names, and we adopted a similar convention: if the value of a + decomposition mapping is the character itself, we use the attribute value # (U+0023 # + NUMBER SIGN) as a shorthand notation; this enables those attributes to be captured in groups. +

    +

    + + [decomposition properties, + 24] + + = + + code-point-attributes &= + attribute dt { "can" | "com" | "enc" | "fin" | "font" | "fra" + | "init" | "iso" | "med" | "nar" | "nb" | "sml" + | "sqr" | "sub" | "sup" | "vert" | "wide" | "none" + }? + + code-point-attributes &= + attribute dm { "#" | zero-or-more-code-points }? + +

    +

    The properties Composition_Exclusion and Full_Composition_Exclusion are + represented by the attributes CE and Comp_Ex: +

    +

    + + [composition properties, + 25] + + = + + code-point-attributes &= + attribute CE { boolean }? + + code-point-attributes &= + attribute Comp_Ex { boolean }? + +

    +

    The properties NFC_Quick_Check, NFD_Quick_Check, + NFKC_Quick_Check, NFKD_Quick_Check, Expands_On_NFC, + Expands_On_NFD, Expands_On_NFKC, Expands_On_NFKD, and + FC_NFKC_Closure have corresponding attributes. +

    +

    + + [quick check properties, + 26] + + = + + code-point-attributes &= + attribute NFC_QC { "Y" | "N" | "M" }? + + code-point-attributes &= + attribute NFD_QC { "Y" | "N" }? + + code-point-attributes &= + attribute NFKC_QC { "Y" | "N" | "M" }? + + code-point-attributes &= + attribute NFKD_QC { "Y" | "N" }? + + + code-point-attributes &= + attribute XO_NFC { boolean }? + + code-point-attributes &= + attribute XO_NFD { boolean }? + + code-point-attributes &= + attribute XO_NFKC { boolean }? + + code-point-attributes &= + attribute XO_NFKD { boolean }? + + + code-point-attributes &= + attribute FC_NFKC { "#" | one-or-more-code-points }? + +

    +

    + 4.4.9 Numeric Properties +

    +

    The numeric type is represented by the nt attribute. +

    +

    The numeric value is represented by the nv attribute, represented as a whole + number or a fraction. +

    +

    + + [numeric properties, + 27] + + = + + code-point-attributes &= + attribute nt { "De" | "Di" | "Nu" | "None" }? + + code-point-attributes &= + attribute nv { "NaN" | xsd:string { pattern="-?[0-9]+(/[0-9]+)?" } }? + +

    +

    + 4.4.10 Joining properties +

    +

    The joining type of a character is represented by the jt attribute. +

    +

    The jg attribute is the joining group of the character. +

    +

    + + [joining properties, + 28] + + = + + code-point-attributes &= + attribute jt { "C" | "D" | "L" | "R" | "T" | "U" }? + + code-point-attributes &= + attribute jg { "African_Feh" | "African_Noon" | "African_Qaf" + | "Ain" | "Alaph" | "Alef" + | "Beh" | "Beth" | "Burushaski_Yeh_Barree" + | "Dal" | "Dalath_Rish" + | "E" + | "Farsi_Yeh" | "Fe" | "Feh" | "Final_Semkath" + | "Gaf" | "Gamal" + | "Hah" | "Hanifi_Rohingya_Kinna_Ya" + | "Hanifi_Rohingya_Pa" | "He" | "Heh" | "Heh_Goal" + | "Heth" + | "Kaf" | "Kaph" | "Kashmiri_Yeh" | "Khaph" + | "Knotted_Heh" + | "Lam" | "Lamadh" + | "Malayalam_Bha" | "Malayalam_Ja" | "Malayalam_Lla" + | "Malayalam_Llla" | "Malayalam_Nga" + | "Malayalam_Nna" | "Malayalam_Nnna" + | "Malayalam_Nya" | "Malayalam_Ra" | "Malayalam_Ssa" + | "Malayalam_Tta" | "Manichaean_Aleph" + | "Manichaean_Ayin" | "Manichaean_Beth" + | "Manichaean_Daleth" | "Manichaean_Dhamedh" + | "Manichaean_Five" | "Manichaean_Gimel" + | "Manichaean_Heth" | "Manichaean_Hundred" + | "Manichaean_Kaph" | "Manichaean_Lamedh" + | "Manichaean_Mem" | "Manichaean_Nun" + | "Manichaean_One" | "Manichaean_Pe" + | "Manichaean_Qoph" | "Manichaean_Resh" + | "Manichaean_Sadhe" | "Manichaean_Samekh" + | "Manichaean_Taw" | "Manichaean_Ten" + | "Manichaean_Teth" | "Manichaean_Thamedh" + | "Manichaean_Twenty" | "Manichaean_Waw" + | "Manichaean_Yodh" | "Manichaean_Zayin" | "Meem" + | "Mim" + | "No_Joining_Group" | "Noon" | "Nun" | "Nya" + | "Pe" + | "Qaf" | "Qaph" + | "Reh" | "Reversed_Pe" | "Rohingya_Yeh" + | "Sad" | "Sadhe" | "Seen" | "Semkath" | "Shin" + | "Straight_Waw" | "Swash_Kaf" | "Syriac_Waw" + | "Tah" | "Taw" | "Teh_Marbuta" | "Teh_Marbuta_Goal" + | "Teth" | "Thin_Yeh" + | "Vertical_Tail" + | "Waw" + | "Yeh" | "Yeh_Barree" | "Yeh_With_Tail" | "Yudh" + | "Yudh_He" + | "Zain" | "Zhain" + }? + +

    +

    The Join_Control property is represented by the Join_C attribute. +

    +

    + + [joining properties, + 29] + + = + + code-point-attributes &= + attribute Join_C { boolean }? + +

    +

    + 4.4.11 Linebreak properties +

    +

    The Line_Break property is represented by the lb attribute. +

    +

    + + [lb attribute, + 30] + + = + + code-point-attributes &= + attribute lb { "AI" | "AK" | "AL" | "AP" | "AS" + | "B2" | "BA" | "BB" | "BK" + | "CB" | "CJ" | "CL" | "CM" | "CP" | "CR" + | "EB" | "EM" | "EX" + | "GL" + | "H2" | "H3" | "HL" | "HY" + | "ID" | "IN" | "IS" + | "JL" | "JT" | "JV" + | "LF" + | "NL" | "NS" | "NU" + | "OP" + | "PO" | "PR" + | "QU" + | "RI" + | "SA" | "SG" | "SP" | "SY" + | "VF" | "VI" + | "WJ" + | "XX" + | "ZW" | "ZWJ" + }? + +

    +

    + 4.4.12 East Asian Width property +

    +

    The East Asian width property is represented by the ea attribute. +

    +

    + + [ea attribute, + 31] + + = + + code-point-attributes &= + attribute ea { "A" | "F" | "H" | "N" | "Na" | "W" }? + +

    +

    + 4.4.13 Case properties +

    +

    The Uppercase, Lowercase, Other_Uppercase and + Other_Lowercase properties are represented by corresponding attributes. +

    +

    + + [casing properties, + 32] + + = + + code-point-attributes &= + attribute Upper { boolean }? + + code-point-attributes &= + attribute Lower { boolean }? + + code-point-attributes &= + attribute OUpper { boolean }? + + code-point-attributes &= + attribute OLower { boolean }? + +

    +

    Most characters have a case mapping and case folding properties that simply map or fold to + themselves. This is very similar to the situation we encountered with names, and we adopted a + similar convention: if the value of a case mapping or case folding property is the character + itself, we use the attribute value # (U+0023 # NUMBER SIGN) as a shorthand notation; this + enables those attributes to be captured in groups. +

    +

    The simple case mappings are recorded in the suc, slc, stc + attributes. +

    +

    + + [casing properties, + 33] + + = + + code-point-attributes &= + attribute suc { "#" | single-code-point }? + + code-point-attributes &= + attribute slc { "#" | single-code-point }? + + code-point-attributes &= + attribute stc { "#" | single-code-point }? + +

    +

    The non-simple case mappings are recorded in the uc, lc and tc + attributes. +

    +

    + + [casing properties, + 34] + + = + + code-point-attributes &= + attribute uc { "#" | one-or-more-code-points }? + + code-point-attributes &= + attribute lc { "#" | one-or-more-code-points }? + + code-point-attributes &= + attribute tc { "#" | one-or-more-code-points }? + +

    +

    The Simple_Case_Folding and Case_Folding properties are recorded in the + scf and cf attributes respectively. +

    +

    + + [casing properties, + 35] + + = + + code-point-attributes &= + attribute scf { "#" | single-code-point }? + + code-point-attributes &= + attribute cf { "#" | one-or-more-code-points }? + +

    +

    The Case_Ignorable, Cased, Changes_When_Casefolded, + Changes_When_Casemapped, Changes_When_Lowercased, + Changes_When_NFKC_Casefolded, Changes_When_Titlecased, + Changes_When_Uppercased, NFKC_Casefold, and + NFKC_Simple_Casefold properties are recorded in these attributes: +

    +

    + + [casing properties, + 36] + + = + + code-point-attributes &= + attribute CI { boolean }? + + code-point-attributes &= + attribute Cased { boolean }? + + code-point-attributes &= + attribute CWCF { boolean }? + + code-point-attributes &= + attribute CWCM { boolean }? + + code-point-attributes &= + attribute CWL { boolean }? + + code-point-attributes &= + attribute CWKCF { boolean }? + + code-point-attributes &= + attribute CWT { boolean }? + + code-point-attributes &= + attribute CWU { boolean }? + + code-point-attributes &= + attribute NFKC_CF { "#" | zero-or-more-code-points }? + + code-point-attributes &= + attribute NFKC_SCF { "#" | zero-or-more-code-points }? + +

    +

    Note that the UCD records more information about case folding than is expressed in the + properties, specifically the entries in CaseFolding.txt with status T. +

    +

    + 4.4.14 Script properties +

    +

    The script and script extension properties are represented by the sc and + scx attributes respectively. +

    +

    + + [script properties, + 37] + + = + + script = "Adlm" | "Aghb" | "Ahom" | "Arab" | "Armi" | "Armn" + | "Avst" + | "Bali" | "Bamu" | "Bass" | "Batk" | "Beng" | "Bhks" + | "Bopo" | "Brah" | "Brai" | "Bugi" | "Buhd" + | "Cakm" | "Cans" | "Cari" | "Cham" | "Cher" | "Chrs" + | "Copt" | "Cpmn" | "Cprt" | "Cyrl" + | "Deva" | "Diak" | "Dogr" | "Dsrt" | "Dupl" + | "Egyp" | "Elba" | "Elym" | "Ethi" + | "Gara" | "Geor" | "Glag" | "Gong" | "Gonm" | "Goth" + | "Gran" | "Grek" | "Gujr" | "Gukh" | "Guru" + | "Hang" | "Hani" | "Hano" | "Hatr" | "Hebr" | "Hira" + | "Hluw" | "Hmng" | "Hmnp" | "Hrkt" | "Hung" + | "Ital" + | "Java" + | "Kali" | "Kana" | "Kawi" | "Khar" | "Khmr" | "Khoj" + | "Kits" | "Knda" | "Krai" | "Kthi" + | "Lana" | "Laoo" | "Latn" | "Lepc" | "Limb" | "Lina" + | "Linb" | "Lisu" | "Lyci" | "Lydi" + | "Mahj" | "Maka" | "Mand" | "Mani" | "Marc" | "Medf" + | "Mend" | "Merc" | "Mero" | "Mlym" | "Modi" | "Mong" + | "Mroo" | "Mtei" | "Mult" | "Mymr" + | "Nagm" | "Nand" | "Narb" | "Nbat" | "Newa" | "Nkoo" + | "Nshu" + | "Ogam" | "Olck" | "Onao" | "Orkh" | "Orya" | "Osge" + | "Osma" | "Ougr" + | "Palm" | "Pauc" | "Perm" | "Phag" | "Phli" | "Phlp" + | "Phnx" | "Plrd" | "Prti" + | "Rjng" | "Rohg" | "Runr" + | "Samr" | "Sarb" | "Saur" | "Sgnw" | "Shaw" | "Shrd" + | "Sidd" | "Sind" | "Sinh" | "Sogd" | "Sogo" | "Sora" + | "Soyo" | "Sund" | "Sunu" | "Sylo" | "Syrc" + | "Tagb" | "Takr" | "Tale" | "Talu" | "Taml" | "Tang" + | "Tavt" | "Telu" | "Tfng" | "Tglg" | "Thaa" | "Thai" + | "Tibt" | "Tirh" | "Tnsa" | "Todr" | "Toto" | "Tutg" + | "Ugar" + | "Vaii" | "Vith" + | "Wara" | "Wcho" + | "Xpeo" | "Xsux" + | "Yezi" | "Yiii" + | "Zanb" | "Zinh" | "Zyyy" | "Zzzz" + + code-point-attributes &= + attribute sc { script }? + + code-point-attributes &= + attribute scx { list { script + } }? + +

    +

    + 4.4.15 ISO Comment properties +

    +

    The ISO 10646 comment field is represented by the isc attribute. +

    +

    + + [isc attribute, + 38] + + = + + code-point-attributes &= + attribute isc { text }? + +

    +

    + 4.4.16 Hangul properties +

    +

    The property Hangul_Syllable_Type is represented by the hst attribute. +

    +

    + + [hst attribute, + 39] + + = + + code-point-attributes &= + attribute hst { "L" | "LV" | "LVT" | "NA" | "T" | "V" }? + +

    +

    The property Jamo_Short_Name is represented by the JSN attribute: +

    +

    + + [JSN attribute, + 40] + + = + + code-point-attributes &= + attribute JSN { xsd:string { pattern="[A-Z]{0,3}" } }? + +

    +

    + 4.4.17 Indic properties +

    +

    The property Indic_Syllabic_Category is represented by the InSC + attribute. +

    +

    + + [InSC attribute, + 41] + + = + + code-point-attributes &= + attribute InSC { "Avagraha" + | "Bindu" + | "Brahmi_Joining_Number" + | "Cantillation_Mark" + | "Consonant" + | "Consonant_Dead" + | "Consonant_Final" + | "Consonant_Head_Letter" + | "Consonant_Initial_Postfixed" + | "Consonant_Killer" + | "Consonant_Medial" + | "Consonant_Placeholder" + | "Consonant_Preceding_Repha" + | "Consonant_Prefixed" + | "Consonant_Subjoined" + | "Consonant_Succeeding_Repha" + | "Consonant_With_Stacker" + | "Gemination_Mark" + | "Invisible_Stacker" + | "Joiner" + | "Modifying_Letter" + | "Non_Joiner" + | "Nukta" + | "Number" + | "Number_Joiner" + | "Other" + | "Pure_Killer" + | "Register_Shifter" + | "Reordering_Killer" + | "Syllable_Modifier" + | "Tone_Letter" + | "Tone_Mark" + | "Virama" + | "Visarga" + | "Vowel" + | "Vowel_Dependent" + | "Vowel_Independent" + }? + +

    +

    The property Indic_Positional_Category is represented by the InPC + attribute: +

    +

    + + [InPC attribute, + 42] + + = + + code-point-attributes &= + attribute InPC { "Bottom" + | "Bottom_And_Left" + | "Bottom_And_Right" + | "Left" + | "Left_And_Right" + | "NA" + | "Overstruck" + | "Right" + | "Top" + | "Top_And_Bottom" + | "Top_And_Bottom_And_Left" + | "Top_And_Bottom_And_Right" + | "Top_And_Left" + | "Top_And_Left_And_Right" + | "Top_And_Right" + | "Visual_Order_Left" + }? + +

    +

    The property Indic_Conjunct_Break is represented by the InCB attribute: +

    +

    + + [InCB attribute, + 43] + + = + + code-point-attributes &= + attribute InCB { "Consonant" + | "Extend" + | "Linker" + | "None" + }? + +

    +

    + 4.4.18 Identifier and Pattern and programming language properties +

    +

    The properties ID_Start, Other_ID_Start, XID_Start, + ID_Continue, Other_ID_Continue, XID_Continue, + ID_Compat_Math_Start, and ID_Compat_Math_Continue are represented by + corresponding attributes: +

    +

    + + [identifier properties, + 44] + + = + + code-point-attributes &= + attribute IDS { boolean }? + + code-point-attributes &= + attribute OIDS { boolean }? + + code-point-attributes &= + attribute XIDS { boolean }? + + code-point-attributes &= + attribute IDC { boolean }? + + code-point-attributes &= + attribute OIDC { boolean }? + + code-point-attributes &= + attribute XIDC { boolean }? + + code-point-attributes &= + attribute ID_Compat_Math_Start { boolean }? + + code-point-attributes &= + attribute ID_Compat_Math_Continue { boolean }? + +

    +

    The properties Pattern_Syntax and Pattern_White_Space are represented + by corresponding attributes: +

    +

    + + [pattern properties, + 45] + + = + + code-point-attributes &= + attribute Pat_Syn { boolean }? + + code-point-attributes &= + attribute Pat_WS { boolean }? + +

    +

    + 4.4.19 Properties related to function and graphic characteristics +

    +

    The properties Dash, Hyphen, Quotation_Mark, + Terminal_Punctuation, Sentence_Terminal, Diacritic, + Extender, Soft_Dotted, Alphabetic, + Other_Alphabetic, Math, Other_Math, Hex_Digit, + ASCII_Hex_Digit, Default_Ignorable_Code_Point, + Other_Default_Ignorable_Code_Point, Logical_Order_Exception, + Prepended_Concatenation_Mark, Modifier_Combining_Mark, + White_Space, Vertical_Orientation, and Regional_Indicator + describe the function or graphic characteristic of a character, and each have a corresponding + attribute. +

    +

    + + [properties related to function and graphic characteristics, + 46] + + = + + code-point-attributes &= + attribute Dash { boolean }? + + code-point-attributes &= + attribute Hyphen { boolean }? + + code-point-attributes &= + attribute QMark { boolean }? + + code-point-attributes &= + attribute Term { boolean }? + + code-point-attributes &= + attribute STerm { boolean }? + + code-point-attributes &= + attribute Dia { boolean }? + + code-point-attributes &= + attribute Ext { boolean }? + + code-point-attributes &= + attribute SD { boolean }? + + code-point-attributes &= + attribute Alpha { boolean }? + + code-point-attributes &= + attribute OAlpha { boolean }? + + code-point-attributes &= + attribute Math { boolean }? + + code-point-attributes &= + attribute OMath { boolean }? + + code-point-attributes &= + attribute Hex { boolean }? + + code-point-attributes &= + attribute AHex { boolean }? + + code-point-attributes &= + attribute DI { boolean }? + + code-point-attributes &= + attribute ODI { boolean }? + + code-point-attributes &= + attribute LOE { boolean }? + + code-point-attributes &= + attribute PCM { boolean }? + + code-point-attributes &= + attribute MCM { boolean }? + + code-point-attributes &= + attribute WSpace { boolean }? + + code-point-attributes &= + attribute vo { "R" | "Tr" | "Tu" | "U" }? + + code-point-attributes &= + attribute RI { boolean }? + +

    +

    + 4.4.20 Properties related to boundaries +

    +

    The properties Grapheme_Base, Grapheme_Extend, + Other_Grapheme_Extend, Grapheme_Link, + Grapheme_Cluster_Break, Word_Break, and Sentence_Break each + have a corresponding attribute: +

    +

    + + [properties related to boundaries, + 47] + + = + + code-point-attributes &= + attribute Gr_Base { boolean }? + + code-point-attributes &= + attribute Gr_Ext { boolean }? + + code-point-attributes &= + attribute OGr_Ext { boolean }? + + code-point-attributes &= + attribute Gr_Link { boolean }? + + code-point-attributes &= + attribute GCB { "CN" | "CR" + | "EB" | "EBG" | "EM" | "EX" + | "GAZ" + | "L" | "LF" | "LV" | "LVT" + | "PP" + | "RI" + | "SM" + | "T" + | "V" + | "XX" + | "ZWJ" + }? + + code-point-attributes &= + attribute WB { "CR" + | "DQ" + | "EB" | "EBG" | "EM" | "EX" | "Extend" + | "FO" + | "GAZ" + | "HL" + | "KA" + | "LE" | "LF" + | "MB" | "ML" | "MN" + | "NL" | "NU" + | "RI" + | "SQ" + | "WSegSpace" + | "XX" + | "ZWJ" + }? + + code-point-attributes &= + attribute SB { "AT" + | "CL" | "CR" + | "EX" + | "FO" + | "LE" | "LF" | "LO" + | "NU" + | "SC" | "SE" | "SP" | "ST" + | "UP" + | "XX" + }? + +

    +

    + 4.4.21 Properties related to ideographs +

    +

    The properties Ideographic, Unified_Ideograph, + Equivalent_Unified_Ideograph, IDS_Binary_Operator, + IDS_Trinary_Operator, IDS_Unary_Operator, and Radical have + corresponding attributes: +

    +

    + + [properties related to ideographs, + 48] + + = + + code-point-attributes &= + attribute Ideo { boolean }? + + code-point-attributes &= + attribute UIdeo { boolean }? + + code-point-attributes &= + attribute EqUIdeo { single-code-point }? + + code-point-attributes &= + attribute IDSB { boolean }? + + code-point-attributes &= + attribute IDST { boolean }? + + code-point-attributes &= + attribute IDSU { boolean }? + + code-point-attributes &= + attribute Radical { boolean }? + +

    +

    + 4.4.22 Miscellaneous properties +

    +

    The properties Deprecated, Variation_Selector, and + Noncharacter_Code_Point have corresponding attributes: +

    +

    + + [miscellaneous properties, + 49] + + = + + code-point-attributes &= + attribute Dep { boolean }? + + code-point-attributes &= + attribute VS { boolean }? + + code-point-attributes &= + attribute NChar { boolean }? + +

    +

    + 4.4.23 Unihan properties +

    +

    The Unihan properties (from the Unihan database) are represented as attributes. +

    +

    + + [Unihan properties, + 50] + + = + + code-point-attributes &= attribute kAccountingNumeric + { xsd:string { pattern="[0-9]+" } }? + + code-point-attributes &= attribute kAlternateTotalStrokes + { list { xsd:string { pattern="(\d+:[BHJKMPSUV]+)|-" }+ } }? + + code-point-attributes &= attribute kBigFive + { xsd:string { pattern="[0-9A-F]{4}'?" } }? + + code-point-attributes &= attribute kCangjie + { xsd:string { pattern="[A-Z]+" } }? + + code-point-attributes &= attribute kCantonese + { list { xsd:string { pattern="[a-z]{1,6}[1-6]" }+ } }? + + code-point-attributes &= attribute kCCCII + { list { xsd:string { pattern="[0-9A-F]{6}" }+ } }? + + code-point-attributes &= attribute kCheungBauer + { list { xsd:string { pattern="[0-9]{3}/[0-9]{2};[A-Z]*;[a-z1-6\[\]/,]+" }+ } }? + + code-point-attributes &= attribute kCheungBauerIndex + { list { xsd:string { pattern="[0-9]{3}\.[01][0-9]" }+ } }? + + code-point-attributes &= attribute kCihaiT + { list { xsd:string { pattern="[1-9][0-9]{0,3}\.[0-9]{3}" }+ } }? + + code-point-attributes &= attribute kCNS1986 + { xsd:string { pattern="[12E]-[0-9A-F]{4}" } }? + + code-point-attributes &= attribute kCNS1992 + { xsd:string { pattern="[1-9]-[0-9A-F]{4}" } }? + + code-point-attributes &= attribute kCompatibilityVariant + { "" | xsd:string { pattern="U\+[23]?[0-9A-F]{4}" } }? + + code-point-attributes &= attribute kCowles + { list { xsd:string { pattern="[0-9]{1,4}(\.[0-9]{1,2})?" }+ } }? + + code-point-attributes &= attribute kDaeJaweon + { xsd:string { pattern="[0-9]{4}\.[0-9]{2}[01]" } }? + + code-point-attributes &= attribute kDefinition + { xsd:string { pattern='[^\t"]+' } }? + + code-point-attributes &= attribute kEACC + { xsd:string { pattern="[0-9A-F]{6}" } }? + + code-point-attributes &= attribute kFanqie + { list { xsd:string { pattern="[\x{3400}-\x{4DBF}\x{4E00}-\x{9FFF}\x{20000}-\x{2A6DF}]{2}" }+ } }? + + code-point-attributes &= attribute kFenn + { list { xsd:string { pattern="[0-9]+a?[A-KP*]" }+ } }? 
+ + code-point-attributes &= attribute kFennIndex + { list { xsd:string { pattern="[0-9][0-9]{0,2}\.[01][0-9]" }+ } }? + + code-point-attributes &= attribute kFourCornerCode + { list { xsd:string { pattern="[0-9]{4}(\.[0-9])?" }+ } }? + + code-point-attributes &= attribute kGB0 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB1 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB3 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB5 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB7 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB8 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGradeLevel + { xsd:string { pattern="[1-6]" } }? + + code-point-attributes &= attribute kGSR + { list { xsd:string { pattern="[0-9]{4}[a-vx-z]'?" }+ } }? + + code-point-attributes &= attribute kHangul + { list { xsd:string { pattern="[\x{1100}-\x{1112}][\x{1161}-\x{1175}][\x{11A8}-\x{11C2}]?:[01ENX]{1,3}" }+ } }? + + code-point-attributes &= attribute kHanYu + { list { xsd:string { pattern="[1-8][0-9]{4}\.[0-3][0-9][0-3]" }+ } }? + + code-point-attributes &= attribute kHanyuPinlu + { list { xsd:string { pattern="[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+\([0-9]+\)" }+ } }? + + code-point-attributes &= attribute kHanyuPinyin + { list { xsd:string { pattern="(\d{5}\.\d{2}0,)*\d{5}\.\d{2}0:([a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+,)*[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kHDZRadBreak + { xsd:string { pattern="[\x{2F00}-\x{2FD5}]\[U\+2F[0-9A-D][0-9A-F]\]:[1-8][0-9]{4}\.[0-3][0-9]0" } }? + + code-point-attributes &= attribute kHKGlyph + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kIBMJapan + { list { xsd:string { pattern="F[ABC][0-9A-F]{2}" }+ } }? 
+ + code-point-attributes &= attribute kIICore + { list { xsd:string { pattern="[ABC][GHJKMPT]{1,7}" }+ } }? + + code-point-attributes &= attribute kIRG_GSource + { "" | xsd:string { pattern="G[013578EKS]-[0-9A-F]{4}" } + | xsd:string { pattern="G4K(-\d{5})?" } + | xsd:string { pattern="G(DZ|GH|RM|WZ|XC|XH|ZH)-\d{4}\.\d{2}" } + | xsd:string { pattern="G(BK|CH|CY|HC)(-\d{4}\.\d{2})?" } + | xsd:string { pattern="GKX-\d{4}\.\d{2,3}" } + | xsd:string { pattern="G(HZ|HZR)-\d{5}\.\d{2}" } + | xsd:string { pattern="G(CE|FC|IDC23|OCD|XHZ)-\d{3}" } + | xsd:string { pattern="G(H|HF|LGYJ|PGLG|T)-\d{4}" } + | xsd:string { pattern="G(CYY|DM|JZ|KJ|XM|ZFY|ZJW|ZYS)-\d{5}" } + | xsd:string { pattern="G(FZ|IDC)-[0-9A-F]{4}" } + | xsd:string { pattern="GGFZ-\d{6}" } + | xsd:string { pattern="G(LK|Z)-\d{7}" } + | xsd:string { pattern="GU-[023][0-9A-F]{4}" } + | xsd:string { pattern="GZA-[123467]\d{5}" } + }? + + code-point-attributes &= attribute kIRG_HSource + { "" | xsd:string { pattern="H-[0-9A-F]{4}" } + | xsd:string { pattern="H(B[012])-[0-9A-F]{4}" } + | xsd:string { pattern="HD-[23]?[0-9A-F]{4}" } + | xsd:string { pattern="HU-[023][0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRG_JSource + { "" | xsd:string { pattern="J[014]-[0-9A-F]{4}" } + | xsd:string { pattern="J3A?-[0-9A-F]{4}" } + | xsd:string { pattern="J13A?-[0-9A-F]{4}" } + | xsd:string { pattern="J14-[0-9A-F]{4}" } + | xsd:string { pattern="JA[34]?-[0-9A-F]{4}" } + | xsd:string { pattern="JARIB-[0-9A-F]{4}" } + | xsd:string { pattern="JH-(JT[ABC][0-9A-F]{3}S?|IB\d{4}|\d{6})" } + | xsd:string { pattern="JK-\d{5}" } + | xsd:string { pattern="JMJ-\d{6}" } + }? + + code-point-attributes &= attribute kIRG_KPSource + { "" | xsd:string { pattern="KP([01]-[0-9A-F]{4}|U-[023][0-9A-F]{4})" } }? + + code-point-attributes &= attribute kIRG_KSource + { "" | xsd:string { pattern="K[0-6]-[0-9A-F]{4}" } + | xsd:string { pattern="KC-\d{5}" } + | xsd:string { pattern="KU-[023][0-9A-F]{4}" } + }? 
+ + code-point-attributes &= attribute kIRG_MSource + { "" | xsd:string { pattern="MA-[0-9A-F]{4}" } + | xsd:string { pattern="MB[12]-[0-9A-F]{4}" } + | xsd:string { pattern="MC-\d{5}" } + | xsd:string { pattern="MDH?-[23]?[0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRG_SSource + { "" | xsd:string { pattern="SAT-\d{5}" } }? + + code-point-attributes &= attribute kIRG_TSource + { "" | xsd:string { pattern="T([1-7A-F]|1[1-3])-[0-9A-F]{4}" } + | xsd:string { pattern="TU-[023][0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRG_UKSource + { "" | xsd:string { pattern="UK-\d{5}" } }? + + code-point-attributes &= attribute kIRG_USource + { "" | xsd:string { pattern="UTC-\d{5}" } }? + + code-point-attributes &= attribute kIRG_VSource + { "" | xsd:string { pattern="V[0-4]-[0-9A-F]{4}" } + | xsd:string { pattern="VN-[023F][0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRGDaeJaweon + { list { xsd:string { pattern="[0-9]{4}\.[0-9]{2}[01]" }+ } }? + + code-point-attributes &= attribute kIRGHanyuDaZidian + { list { xsd:string { pattern="[1-8][0-9]{4}\.[0-3][0-9][01]" }+ } }? + + code-point-attributes &= attribute kIRGKangXi + { list { xsd:string { pattern="[01][0-9]{3}\.[0-7][0-9][01]" }+ } }? + + code-point-attributes &= attribute kJa + { list { xsd:string { pattern="[0-9A-F]{4}S?" }+ } }? + + code-point-attributes &= attribute kJapanese + { list { xsd:string { pattern="[\x{3041}-\x{3096}\x{3099}\x{309A}\x{30A1}-\x{30FA}\x{30FC}]+" }+ } }? + + code-point-attributes &= attribute kJapaneseKun + { list { xsd:string { pattern="[A-Z]+" }+ } }? + + code-point-attributes &= attribute kJapaneseOn + { list { xsd:string { pattern="[A-Z]+" }+ } }? + + code-point-attributes &= attribute kJinmeiyoKanji + { list { xsd:string { pattern="(20[0-9]{2})(:U\+[23]?[0-9A-F]{4})?" }+ } }? + + code-point-attributes &= attribute kJis0 + { list { xsd:string { pattern="[0-9]{4}" }+ } }? 
+ + code-point-attributes &= attribute kJis1 + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kJIS0213 + { list { xsd:string { pattern="[12],[0-9]{2},[0-9]{1,2}" }+ } }? + + code-point-attributes &= attribute kJoyoKanji + { list { xsd:string { pattern="(20[0-9]{2})|(U\+[23]?[0-9A-F]{4})" }+ } }? + + code-point-attributes &= attribute kKangXi + { list { xsd:string { pattern="[0-9]{4}\.[0-9]{2}[01]" }+ } }? + + code-point-attributes &= attribute kKarlgren + { list { xsd:string { pattern="[1-9][0-9]{0,3}[A*]?" }+ } }? + + code-point-attributes &= attribute kKorean + { list { xsd:string { pattern="[A-Z]+" }+ } }? + + code-point-attributes &= attribute kKoreanEducationHanja + { list { xsd:string { pattern="20[0-9]{2}" }+ } }? + + code-point-attributes &= attribute kKoreanName + { list { xsd:string { pattern="20[0-9]{2}" }+ } }? + + code-point-attributes &= attribute kLau + { list { xsd:string { pattern="[1-9][0-9]{0,3}" }+ } }? + + code-point-attributes &= attribute kMainlandTelegraph + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kMandarin + { list { xsd:string { pattern="[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kMatthews + { list { xsd:string { pattern="[1-9][0-9]{0,3}(a|\.5)?" }+ } }? + + code-point-attributes &= attribute kMeyerWempe + { list { xsd:string { pattern="[1-9][0-9]{0,3}[a-t*]?" }+ } }? + + code-point-attributes &= attribute kMojiJoho + { list { xsd:string { pattern="MJ\d{6}(:(FE0[01]|E01[01][0-9A-F]))?" }+ } }? + + code-point-attributes &= attribute kMorohashi + { list { xsd:string { pattern="(\d{5}'{0,2}|H\d{3})(:(FE0[01]|E010[0-9A-F]))?" }+ } }? + + code-point-attributes &= attribute kNelson + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kOtherNumeric + { list { xsd:string { pattern="[0-9]+" }+ } }? 
+ + code-point-attributes &= attribute kPhonetic + { list { xsd:string { pattern="[1-9][0-9]{0,3}[A-D]?\*?" }+ } }? + + code-point-attributes &= attribute kPrimaryNumeric + { list { xsd:string { pattern="[0-9]+" }+ } }? + + code-point-attributes &= attribute kPseudoGB1 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kRSAdobe_Japan1_6 + { list { xsd:string { pattern="[CV]\+[0-9]{1,5}\+[1-9][0-9]{0,2}\.[1-9][0-9]?\.[0-9]{1,2}" }+ } }? + + code-point-attributes &= attribute kRSUnicode + { list { xsd:string { pattern="[1-9][0-9]{0,2}'{0,3}\.-?[0-9]{1,2}" }+ } }? + + code-point-attributes &= attribute kSBGY + { list { xsd:string { pattern="[0-9]{3}\.[0-7][0-9]" }+ } }? + + code-point-attributes &= attribute kSemanticVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}(<[ks][A-Za-z0-9_]+(:[TBZFJ]+)?(,[ks][A-Za-z0-9_]+(:[TBZFJ]+)?)*)?" }+ } }? + + code-point-attributes &= attribute kSimplifiedVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}" }+ } }? + + code-point-attributes &= attribute kSMSZD2003Index + { list { xsd:string { pattern="\d{1,3}\.\d{2}" }+ } }? + + code-point-attributes &= attribute kSMSZD2003Readings + { list { xsd:string { pattern="[a-z\x{300}\x{301}\x{302}\x{304}\x{308}\x{30C}]+(,[a-z\x{300}\x{301}\x{302}\x{304}\x{308}\x{30C}]+)*\x{7CB5}[a-z]+[1-6]([a-z]+[1-6])?(,[a-z]+[1-6]([a-z]+[1-6])?)*" }+ } }? + + code-point-attributes &= attribute kSpecializedSemanticVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}(<[ks][A-Za-z0-9_]+(:[TBZFJ]+)?(,[ks][A-Za-z0-9_]+(:[TBZFJ]+)?)*)?" }+ } }? + + code-point-attributes &= attribute kSpoofingVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}" }+ } }? + + code-point-attributes &= attribute kStrange + { list { ( xsd:string { pattern="[ACU]" } + | xsd:string { pattern="B:U\+31[0-2AB][0-9A-F]" } + | xsd:string { pattern="[FMOR](:U\+[23]?[0-9A-F]{4})?" 
} + | xsd:string { pattern="H:U\+31[3-8][0-9A-F]" } + | xsd:string { pattern="I(:U\+[23]?[0-9A-F]{4})*" } + | xsd:string { pattern="K(:U\+30[A-F][0-9A-F])+" } + | xsd:string { pattern="S:[4-9][0-9]" } + )+}}? + + code-point-attributes &= attribute kTaiwanTelegraph + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kTang + { list { xsd:string { pattern="\*?[A-Za-z()\x{E6}\x{251}\x{259}\x{25B}\x{300}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kTGH + { list { xsd:string { pattern="20[0-9]{2}:[1-9][0-9]{0,3}" }+ } }? + + code-point-attributes &= attribute kTGHZ2013 + { list { xsd:string { pattern="[0-9]{3}\.[0-9]{3}(,[0-9]{3}\.[0-9]{3})*:[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kTotalStrokes + { list { xsd:string { pattern="[1-9][0-9]{0,2}" }+ } }? + + code-point-attributes &= attribute kTraditionalVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}" }+ } }? + + code-point-attributes &= attribute kUnihanCore2020 + { xsd:string { pattern="[GHJKMPT]{1,7}" } }? + + code-point-attributes &= attribute kVietnamese + { list { xsd:string { pattern="[A-Za-z\x{110}\x{111}\x{300}-\x{303}\x{306}\x{309}\x{31B}\x{323}]+" }+ } }? + + code-point-attributes &= attribute kVietnameseNumeric + { list { xsd:string { pattern="\d+" }+ } }? + + code-point-attributes &= attribute kXerox + { list { xsd:string { pattern="[0-9]{3}:[0-9]{3}" }+ } }? + + code-point-attributes &= attribute kXHC1983 + { list { xsd:string { pattern="[0-9]{4}\.[0-9]{3}\*?(,[0-9]{4}\.[0-9]{3}\*?)*:[a-z\x{300}\x{301}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kZhuang + { list { xsd:string { pattern="[a-z]+\*?" }+ } }? + + code-point-attributes &= attribute kZhuangNumeric + { list { xsd:string { pattern="\d+" }+ } }? + + code-point-attributes &= attribute kZVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}(<[ks][A-Za-z0-9_]+(:[TBZ]+)?(,[ks][A-Za-z0-9_]+(:[TBZ]+)?)*)?" 
}+ } }? + +

    +

    + 4.4.24 Tangut data +

    +

    The Tangut data are represented as attributes. The attribute kRSTUnicode + represents the radical stroke index. The attribute kTGT_MergedSrc indicates the + source reference for the character. +

    +

    + + [Tangut data, + 51] + + = + + code-point-attributes &= + attribute kRSTUnicode { xsd:string { pattern="[0-9]+\.[0-9]+" } }? + + code-point-attributes &= + attribute kTGT_MergedSrc + { xsd:string {pattern="L2008-[0-9A-F]{4,5}(-[0-9]{4,5})?"} + | xsd:string {pattern="L2006-[0-9]{4}"} + | xsd:string {pattern="L1997-[0-9]{4}"} + | xsd:string {pattern="L1986-[0-9]{4}"} + | xsd:string {pattern="S1968-[0-9]{4}"} + | xsd:string {pattern="N1966-[0-9]{3}(-[0-9A-Z]{3,4})?"} + | xsd:string {pattern="H2004-[A-Z]-[0-9]{4}"} + | xsd:string {pattern="L2012-[0-9]{4}"} + | xsd:string {pattern="UTN42-[0-9]{3}"} + }? + +

    +

    + 4.4.25 Nushu data +

    +

    The Nushu data are represented as attributes. The attribute kSrc_NushuDuben + indicates the page number and order of the item from the NushuDuben reference source. The Nushu common + reading is represented as kReading.

    +

    + + [Nushu data, + 52] + + = + + code-point-attributes &= + attribute kSrc_NushuDuben { xsd:string { pattern="[0-9]+\.[0-9]+" } }? + + code-point-attributes &= + attribute kReading { xsd:string }? + +

    +

    + 4.4.26 Emoji properties +

    +

    The properties Emoji, Emoji_Presentation, Emoji_Modifier, Emoji_Modifier_Base, + Emoji_Component, and Extended_Pictographic have corresponding attributes: +

    +

    + + [Emoji properties, + 53] + + = + + code-point-attributes &= + attribute Emoji { boolean }? + + code-point-attributes &= + attribute EPres { boolean }? + + code-point-attributes &= + attribute EMod { boolean }? + + code-point-attributes &= + attribute EBase { boolean }? + + code-point-attributes &= + attribute EComp { boolean }? + + code-point-attributes &= + attribute ExtPict { boolean }? + +

    +

    + 5 Blocks +

    +

    The blocks child of the ucd describes the blocks. It has one child + block element per block, with attributes to describe the extent and name of the block. +

    +

    + + [blocks, + 54] + + = + + ucd.content &= + element blocks { + element block { + attribute first-cp { single-code-point }, + attribute last-cp { single-code-point }, + attribute name { text } }+ }? + +

    +

    + 6 Named Sequences +

    +

    The named-sequences child of the ucd describes the named sequences. It has one + child named-sequence element per named sequence, with attributes to describe the name and + sequence. +

    +

    Similarly, the provisional-named-sequences child of the ucd describes the + provisional named sequences. +

    +

    + + [named sequences, + 55] + + = + + ucd.content &= + element named-sequences { + element named-sequence { + attribute cps { one-or-more-code-points }, + attribute name { text } }+ }? + + ucd.content &= + element provisional-named-sequences { + element named-sequence { + attribute cps { one-or-more-code-points }, + attribute name { text } }+ }? + +

    +

    + 7 Normalization Corrections +

    +

    The normalization-corrections child of the ucd describes the normalization + corrections. It has one child normalization-correction element per correction, with + attributes to describe the code point affected, its old normalization, its new normalization and the + version of Unicode in which the correction was made. +

    +

    + + [normalization corrections, + 56] + + = + + ucd.content &= + element normalization-corrections { + element normalization-correction { + attribute cp { single-code-point }, + attribute old { one-or-more-code-points }, + attribute new { one-or-more-code-points }, + attribute version { text } }+ }? + +

    +

    + 8 Standardized Variants +

    +

    The standardized-variants child of the ucd describes the standardized + variants. It has one child element standardized-variant per variant. The attributes on that + last element capture the variation sequence, the description of the desired appearance, and the shaping + environment under which the appearance is different. +

    +

    + + [standardized variants, + 57] + + = + + ucd.content &= + element standardized-variants { + element standardized-variant { + attribute cps { two-code-points }, + attribute desc { text }, + attribute when { text } }+ }? + +

    +

    + 9 CJK Radicals +

    +

    The cjk-radicals child of the ucd describes the CJK radicals. It has one + child element cjk-radical per radical. The attributes on that last element capture the + radical number, the corresponding CJK radical character, and the corresponding CJK unified ideograph. +

    +

    + + [cjk radicals, + 58] + + = + + ucd.content &= + element cjk-radicals { + element cjk-radical { + attribute number { xsd:string {pattern="[0-9]{1,3}'{0,3}"}}, + attribute radical { single-code-point? }, + attribute ideograph { single-code-point } }+ }? + +

    +

    + 10 Emoji sources +

    +

    The emoji-sources child of the ucd describes the emoji sources. +

    +

    + + [emoji sources, + 59] + + = + + ucd.content &= + element emoji-sources { + element emoji-source { + attribute unicode { one-or-more-code-points }, + attribute docomo { jis-code-point? }, + attribute kddi { jis-code-point? }, + attribute softbank { jis-code-point? } }+ }? + +

    +

    + + [datatype for code points, + 60] + + = + + jis-code-point = xsd:string { pattern = "[0-9A-F]{4}" } + +

    +

    + 11 Do Not Emit +

    +

    + The do-not-emit child of the ucd describes the + character sequences that should not be emitted or generated in newly authored texts. + +

    +

    + + [do-not-emit, + 61] + + = + + ucd.content &= + element do-not-emit { + element instead { + attribute of { one-or-more-code-points }, + attribute use { one-or-more-code-points }, + attribute because { "Bengali_Khanda_Ta" + | "Deprecated" + | "Discouraged" + | "Dotless_Form" + | "Hamza_Form" + | "Indic_Atomic_Consonant" + | "Indic_Consonant_Conjunct" + | "Indic_Vowel_Letter" + | "Malayalam_Chillu" + | "Precomposed_Form" + | "Precomposed_Hieroglyph" + | "Preferred_Spelling" + | "Tamil_Shrii" + } }+ }? + +

    +

    + 12 The full schema +

    +

    Our schema is just the accumulation of the pieces we have described so far: +

    +

    + + [UCD RelaxNG schema] + + = + + + [namespace declaration: 1] + + + [datatypes: 2, 3, 60] + + + [schema start: 4] + + + [boolean: 5] + + + [description: 6] + + + [repertoire: 7, 8, 9, 10] + + + [attributes: 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50] + + + [Tangut data: 51] + + + [Nushu data: 52] + + + [blocks: 54] + + + [named sequences: 55] + + + [normalization corrections: 56] + + + [standardized variants: 57] + + + [cjk radicals: 58] + + + [emoji sources: 59] + + + [Emoji properties: 53] + + + [do-not-emit: 61] + + +

    +

    An expanded version is linked from the top of this document.

    +

    + 13 Examples +

    +

    Here is a fragment of the UCD for a few representative + characters (only some of the properties are represented): +

    +
    +            
    +  <ucd xmlns="http://www.unicode.org/ns/2003/ucd/1.0">
    +    <repertoire>
    +      <char cp="001F" age="1.1" na="&lt;control&gt;" na1="UNIT SEPARATOR"
    +            gc="Cc" bc="S" lb="CM"/>
    +
    +      <char cp="0020" age="1.1" na="SPACE" gc="Zs" bc="WS" ea="Na" lb="SP"/>
    +
    +      <char cp="0026" age="1.1" na="AMPERSAND" gc="Po" bc="ON" ea="Na"/>
    +
    +      <char cp="0028" age="1.1" na="LEFT PARENTHESIS" na1="OPENING PARENTHESIS"
    +            gc="Ps" bc="ON" Bidi_M="y" bmg="0029" ea="Na" lb="OP"/>
    +
    +      <char cp="0041" age="1.1" na="LATIN CAPITAL LETTER A"
    +            gc="Lu" slc="0061" ea="Na" sc="Latn"/>
    +
    +      <char cp="AC00" age="2.0" na="HANGUL SYLLABLE GA" gc="Lo"
    +            dt="can" dm="1100 1161" ea="W" lb="ID" sc="Hang"/>
    +
    +      <char cp="20094" age="3.1" na="CJK UNIFIED IDEOGRAPH-20094"
    +            gc="Lo" ea="W" lb="ID" sc="Hani" kIRG_GSource="GKX-0082.09"
    +            kIRGHanyuDaZidian="10036.060" kIRG_TSource="T5-214E"
    +            kRSUnicode="4.3" kIRGKangXi="0082.090"/>
    +
    +      <group age="3.2" gc="Lo" sc="Buhd">
    +        <char cp="1740" na="BUHID LETTER A"/>
    +        <char cp="1741" na="BUHID LETTER I"/>
    +        <char cp="1752" na="BUHID VOWEL SIGN I" gc="Mn"/>
    +        <char cp="1820" age="3.0" na="MONGOLIAN LETTER A" sc="Mong"/>
    +      </group>
    +    </repertoire>
    +  </ucd>
    +
    +
    +

    + Acknowledgments +

    +

    Thanks to Markus Scherer and Mark Davis for their help developing this XML representation. Thanks to + the reviewers: Julie Allen, Ernest van den Boogaard, Daniel Bünzli, John Cowan, Asmus Freytag, + Felix Sasaki, Andrew West. Special thanks to Eric Muller and Laurențiu Iancu. +

    +

    + Modifications +

    +

    This section indicates the changes introduced by each revision.

    +
    +

    + Revision 36 +

    +
      +
    • New value for the age attribute: 16.0. +
    • +
    • New values for the blk attribute: Egyptian_Hieroglyphs_Ext_A, + Garay, Gurung_Khema, Kirat_Rai, Myanmar_Ext_C, + Ol_Onal, Sunuwar, Symbols_for_Legacy_Computing_Sup, + Todhri, Tulu_Tigalari. +
    • +
    • New values for the sc attribute: Gara, Gukh, + Krai, Onao, Sunu, Todr, Tutg. +
    • +
    • New value for the jg attribute: Kashmiri_Yeh.
    • +
    • New value for the InSC attribute: Reordering_Killer. +
    • +
    • New attributes: MCM, kFanqie, kZhuang. +
    • +
    • Modified patterns for the cjk-radical/@number, kRSUnicode and + kIRG_GSource + attributes. +
    • +
    • Added the do-not-emit element. +
    • +
    +
    +
    +

    Revision 35 being a proposed update, only changes between revisions 34 and 36 are + noted here. +

    +
    +
    +

    + Revision 34 +

    +
      +
    • New value for the age attribute: 15.1. +
    • +
    • New value for the blk attribute: CJK_Ext_I. +
    • +
    • New values for the lb attribute: AK, AP, + AS, VF, VI. +
    • +
    • Modified values for the number, radical attributes of the + cjk-radical + element. +
    • +
    • Changed single value into list for the nv code point attribute. +
    • +
    • New code point attributes: ID_Compat_Math_Continue, + ID_Compat_Math_Start, IDSU, NFKC_SCF, InCB. +
    • +
    • Modified patterns for the kBigFive, kIRG_GSource, + kMorohashi, kRSUnicode attributes. +
    • +
    • Changed single values into lists for the kMorohashi, kPrimaryNumeric + Unihan attributes. +
    • +
    • New Unihan attributes: kJapanese, kMojiJoho, + kSMSZD2003Index, kSMSZD2003Readings, kVietnameseNumeric, + kZhuangNumeric. +
    • +
    +
    +
    +

    Revision 33 being a proposed update, only changes between revisions 32 and 34 are + noted here. +

    +
    +
    +

    + Revision 32 +

    +
      +
    • New value for the age attribute: 15.0. +
    • +
    • New values for the blk attribute: Arabic_Ext_C, CJK_Ext_H, + Cyrillic_Ext_D, Devanagari_Ext_A, Kaktovik_Numerals, Kawi, + Nag_Mundari. +
    • +
    • New values for the sc attribute: Kawi, Nagm. +
    • +
    • New Unihan attribute: kAlternateTotalStrokes. +
    • +
    • Modified patterns for the kIRG_GSource, kIRG_HSource, + kIRG_TSource, kSemanticVariant, kSpecializedSemanticVariant, + kZVariant + attributes. +
    • +
    +
    +
    +

    Revision 31 being a proposed update, only changes between revisions 30 and 32 are + noted here. +

    +
    +
    +

    + Revision 30 +

    +
      +
    • New value for the age attribute: 14.0. +
    • +
    • New values for the blk attribute: Arabic_Ext_B, + Cypro_Minoan, Ethiopic_Ext_B, Kana_Ext_B, + Latin_Ext_F, Latin_Ext_G, Old_Uyghur, Tangsa, + Toto, UCAS_Ext_A, Vithkuqi, Znamenny_Music. +
    • +
    • New values for the sc attribute: Cpmn, Ougr, + Tnsa, Toto, Vith. +
    • +
    • New values for the jg attribute: Thin_Yeh, Vertical_Tail. +
    • +
    • New Unihan attribute: kStrange. +
    • +
    • Modified patterns for the kIRG_GSource, kIRG_MSource, + kIRG_VSource, kPhonetic, kSpoofingVariant attributes. +
    • +
    • Removal of the kWubi attribute, which has never been present in + released versions of the UCD. +
    • +
    +
    +
    +

    Revision 29 being a proposed update, only changes between revisions 28 and 30 are + noted here. +

    +
    +
    +

    + Revision 28 +

    +
      +
    • New value for the age attribute: 13.0. +
    • +
    • New values for the blk attribute: Chorasmian, CJK_Ext_G, + Dives_Akuru, Khitan_Small_Script, Lisu_Sup, + Symbols_For_Legacy_Computing, Tangut_Sup, Yezidi. +
    • +
    • New values for the sc attribute: Chrs, Diak, + Kits, Yezi. +
    • +
    • New value for the InPC attribute: Top_And_Bottom_And_Left. +
    • +
    • New Unihan attributes: kSpoofingVariant, kUnihanCore2020, + kIRG_SSource, kIRG_UKSource, kTGHZ2013. +
    • +
    • New Emoji attributes: Emoji, EPres, EMod, + EBase, EComp, ExtPict. +
    • +
    • Modified patterns for the kIRG_GSource, kIRG_HSource, + kIRG_KPSource, kIRG_KSource, kIRG_TSource, kKangXi, + kSemanticVariant, kSimplifiedVariant, + kSpecializedSemanticVariant, kTraditionalVariant attributes. +
    • +
    +
    +
    +

    Revision 27 being a proposed update, only changes between revisions 26 and 28 are + noted here. +

    +
    +
    +

    + Revision 26 +

    +
      +
    • New value for the age attribute: 12.1. +
    • +
    +
    +
    +

    + Revision 25 +

    +
      +
    • New value for the age attribute: 12.0. +
    • +
    • New values for the sc attribute: Elym, Hmnp, + Nand, Wcho. +
    • +
    • New values for the blk attribute: + Egyptian_Hieroglyph_Format_Controls, Elymaic, Nandinagari, + Nyiakeng_Puachue_Hmong, Ottoman_Siyaq_Numbers, Small_Kana_Ext, + Symbols_And_Pictographs_Ext_A, Tamil_Sup, Wancho. +
    • +
    • Modified patterns for the kIRG_GSource, kIRG_KSource, + kIRG_TSource, kTaiwanTelegraph attributes. +
    • +
    +
    +
    +

    Revision 24 being a proposed update, only changes between revisions 23 and 25 are + noted here. +

    +
    +
    +

    + Revision 23 +

    +
      +
    • New value for the age attribute: 11.0. +
    • +
    • New values for the blk attribute: Chess_Symbols, + Dogra, Georgian_Ext, Gunjala_Gondi, + Hanifi_Rohingya, Indic_Siyaq_Numbers, Makasar, + Mayan_Numerals, Medefaidrin, Old_Sogdian, Sogdian. +
    • +
    • New values for the sc attribute: Dogr, Gong, + Maka, Medf, Rohg, Sogd, Sogo. +
    • +
    • New values for the jg attribute: Hanifi_Rohingya_Kinna_Ya, + Hanifi_Rohingya_Pa. +
    • +
    • New value for the WB attribute: WSegSpace. +
    • +
    • New value for the InSC attribute: Consonant_Initial_Postfixed. +
    • +
    • New attributes: EqUIdeo, kJinmeiyoKanji, kJoyoKanji, + kKoreanEducationHanja, kKoreanName, kTGH. +
    • +
    • Modified patterns for the kTGT_MergedSrc attribute. +
    • +
    • Modified patterns for the kIRG_GSource, kIRG_HSource and + kIRG_VSource + attributes. +
    • +
    +
    +
    +

    Revision 22 being a proposed update, only changes between revisions 21 and 23 are + noted here. +

    +
    +
    +

    + Revision 21 +

    +
      +
    • New value for the age attribute: 10.0. +
    • +
    • New values for the blk attribute: CJK_Ext_F, Kana_Ext_A, + Masaram_Gondi, Nushu, Soyombo, Syriac_Sup, + Zanabazar_Square. +
    • +
    • New values for the sc attribute: Gonm, Nshu, + Soyo, Zanb. +
    • +
    • New values for the jg attribute: Malayalam_Nga, + Malayalam_Ja, Malayalam_Nya, Malayalam_Tta, Malayalam_Nna, + Malayalam_Nnna, Malayalam_Bha, Malayalam_Ra, + Malayalam_Lla, Malayalam_Llla, Malayalam_Ssa. +
    • +
    • New value for the InPC attribute: Bottom_And_Left. +
    • +
    • Modified patterns for the kIRG_GSource, kIRG_JSource, + kIRG_KSource + attributes. +
    • +
    • New code point attributes: vo, + RI +
    • +
    • New code point attributes for Nushu data: kSrc_NushuDuben and + kReading. +
    • +
    +
    +
    +

    Revision 20 being a proposed update, only changes between revisions 19 and 21 are + noted here. +

    +
    +
    +

    + Revision 19 +

    +
      +
    • New value for the age attribute: 9.0. +
    • +
    • New values for the sc attribute: Adlm, Bhks, + Marc, Newa, Osge, Tang. +
    • +
    • New values for the blk attribute: Adlam, Bhaiksuki, + Cyrillic_Ext_C, Glagolitic_Sup, Ideographic_Symbols, + Marchen, Mongolian_Sup, Newa, Osage, + Tangut, Tangut_Components. +
    • +
    • New values for the gcb attribute: EB, EBG, EM, + GAZ, ZWJ. +
    • +
    • New values for the wb attribute: EB, EBG, EM, + GAZ, ZWJ. +
    • +
    • New values for the lb attribute: EB, EM, ZWJ. +
    • +
    • New values for the jg attribute: African_Feh, + African_Noon, African_Qaf. +
    • +
    • New code point attributes: PCM, kRSTUnicode and + kTGT_MergedSrc. +
    • +
    • Modified patterns for the kRSUnicode, kRSKangXi, + kMandarin, kIRG_JSource, kIRG_USource and kFennIndex + attributes. +
    • +
    +
    +
    +

    Revision 18 being a proposed update, only changes between revisions 17 and 19 are + noted here. +

    +
    +
    +

    + Revision 17 +

    +
      +
    • New value for the age attribute: 8.0. +
    • +
    • New values for the sc attribute: Ahom, Hatr, + Hluw, Hung, Mult, Sgnw. +
    • +
    • New values for the blk attribute: Ahom, + Anatolian_Hieroglyphs, Cherokee_Sup, CJK_Ext_E, + Early_Dynastic_Cuneiform, Hatran, Multani, Old_Hungarian, + Sup_Symbols_And_Pictographs, Sutton_SignWriting. +
    • +
    • New values for the InSC attribute: Consonant_Killer, + Consonant_Prefixed, Consonant_With_Stacker, Syllable_Modifier. +
    • +
    • New code point attributes: InPC, kJa. +
    • +
    • New patterns for the kIRG_GSource attribute: GFC-, GGFZ-. +
    • +
    • Switched the reference to ISO 19757 from :2003 and :2003 Amd1 to :2008.
    • +
    +
    +
    +

    Revision 16 being a proposed update, only changes between revisions 15 and 17 are + noted here. +

    +
    +
    +

    + Revision 15 +

    +
      +
    • New value for the age attribute: 7.0. +
    • +
    • New values for the jg attribute. +
    • +
    • New values for the sc attribute. +
    • +
    • New values for the blk attribute. +
    • +
    • New values for the InSC attribute. +
    • +
    • New values for the kIICore attribute. +
    • +
    • New values for the kIRG_GSource attribute. +
    • +
    +
    +
    +

    Revision 14 being a proposed update, only changes between revisions 13 and 15 are + noted here. +

    +
    +
    +

    + Revision 13 +

    +
      +
    • New value for the age attribute: 6.3. +
    • +
    • New values DQ, HL, SQ for the WB attribute (for Unicode 6.3). +
    • +
    • New code point attributes bpt and bpb (for Unicode 6.3). +
    • +
    • New values for the bc attribute: LRI, RLI, FSI, + PDI + (for Unicode 6.3). +
    • +
    • Updated the patterns for kHanyuPinlu and kTotalStrokes (for + Unicode 6.3). +
    • +
    • Updated the patterns for kIRG_HSource and kIRG_HSource (for + Unicode 6.2). +
    • +
    • Clarified that the child elements of list-like elements are in no particular order.
    • +
    +
    +
    +

    Revision 12 being a proposed update, only changes between revisions 11 and 13 are + noted here. +

    +
    +
    +

    + Revision 11 +

    +
      +
    • New value for the age attribute: 6.2. +
    • +
    • New value for the gcb, wb and lb attributes: + RI + (for Unicode 6.2). +
    • +
    • Updated the patterns for kIRG_GSource and kIRG_HSource (for + Unicode 6.2). +
    • +
    +
    +
    +

    Revision 10 being a proposed update, only changes between revisions 9 and 11 are + noted here. +

    +
    +
    +

    + Revision 9 +

    +
      +
    • Clarified the default values.
    • +
    • Indicate that property values may change from one release to the next.
    • +
    • Introduced the blk attribute, for the Block property. +
    • +
    • Introduced the scx attribute, for the ScriptExtensions property. +
    • +
    • Introduced the name-alias element, for the Name_Alias property. +
    • +
    • New value for the age attribute: 6.1. +
    • +
    • New values for the script attribute: Cakm, Merc, + Mero, Plrd, Shrd, Sora, Takr. +
    • +
    • New values for the lb attribute: HL and CJ. +
    • +
    • New value for the jg attribute: Rohingya_Yeh. +
    • +
    • The value of the fc_nfkc attribute must now be either # or + one-or-more-code-points. +
    • +
    • For the nv attribute, the absence of a numeric value is now represented by + NaN + rather than by the empty string. +
    • +
    • The values of the ccc attribute are now restricted to 0..254, instead of 0..255. +
    • +
    • Updated the patterns for kSemanticVariant, + kSpecializedSemanticVariant, kIRG_USource, and kMandarin. +
    • +
    +
    +
    +

    Revision 8 being a proposed update, only changes between revisions 7 and 9 are noted + here. +

    +
    +
    +

    + Revision 7 +

    +
      +
    • New value for the age attribute: 6.0. +
    • +
    • New value for the jg attribute: + Teh_Marbuta_Goal +
    • +
    • New values for the script attribute: Batk, Brah, + Mand. +
    • +
    • Updated the patterns for kIRG_GSource, kIRG_HSource, + kIRG_JSource, kIRG_KSource, kIRG_MSource, + kIRG_TSource, kIRG_VSource. +
    • +
    • Added the InSC and InMC elements. +
    • +
    • Added the emoji-sources element. +
    • +
    +
    +
    +

    Revision 6 being a proposed update, only changes between revisions 5 and 7 are noted + here. +

    +
    +
    +

    + Revision 5 +

    +
      +
    • Changed the type of block/@first-cp, block/@last-cp and + normalization-corrections/@cp + from text to + single-code-point +
    • +
    • Changed the type of named-sequence/@cps, + provisional-named-sequences/@cps, normalization-correction/@old and + normalization-correction/@new + from text to one-or-more-code-points. +
    • +
    • Changed the type of standardized-variants/@cps from text to + two-code-points. +
    • +
    • New values for the jg attribute: Farsi_Yeh and Nya. +
    • +
    • New value for the age attribute: 5.2. +
    • +
    • New values for the sc attribute: Lana, Tavt, + Avst, Egyp, Samr, Lisu, Bamu, Java, + Mtei, Armi, Sarb, Prti, Phli, Orkh, + Kthi. +
    • +
    • New value for the lb attribute: CP. +
    • +
    • New value for the sc attribute: Zinh. +
    • +
    • New code point attributes CI, Cased, CWCF, + CWCM, CWL, CWKCF, CWT, CWU, + NFKC_CF. +
    • +
    • New attributes kHanyuPinyin and kIRG_MSource. +
    • +
    • New element + cjk-radicals +
    • +
    • Updated the patterns for kIRG_GSource, kIRG_JSource, + kIRG_KPSource, kIRG_KSource, kIRG_TSource, + kIRG_VSource, kHanyuPinlu, kMandarin, + kSemanticVariant, kSpecializedSemanticVariant, + kVietnamese, kZVariant. +
    • +
    • Point out that Relax NG schemas do not modify or augment the infoset, and that it is possible + to mechanically convert our schema to other schema languages. +
    • +
    +
    +
    +

    Revision 4 being a proposed update, only changes between revisions 3 and 5 are noted + here. +

    +
    +
    +

    + Revision 3 +

    +
      +
    • First approved version, for Unicode 5.1.0.
    • +
    • For optional elements which act as collections, such as repertoire and + named-sequences, impose that there be at least one element in the collection. +
    • +
    • Remove the constraint that the value of jg is limited when jt has + certain values; similarly for bmg / Bidi_M and for nv / + nt. +
    • +
    • Value NL added to the WB attribute (for Unicode 5.1). +
    • +
    • Value PP added to the GCB attribute (for Unicode 5.1). +
    • +
    • Corrected the Vai script value to Vaii. +
    • +
    • Removed the discussion of elements or attributes in different namespace.
    • +
    • Removed the code-point element. +
    • +
    +
    +
    +

    + Revision 2 +

    +
      +
    • Promoted to Draft UAX.
    • +
    • Changed the title from "An XML representation of the UCD"
    • +
    • Value 5.1 added to the age attribute (for Unicode 5.1). +
    • +
    • Value SM added to the gcb attribute (for Unicode 5.1). +
    • +
    • Values CR, Extend, LF, MB added to the + WB + attribute (for Unicode 5.1). +
    • +
    • Values CR, EX, LF, SC added to the SB + attribute (for Unicode 5.1). +
    • +
    • Value Burushaski_Yeh_Barree added to the jg attribute (for + Unicode 5.1). +
    • +
    • Value Alef_Maqsurah added to the jg attribute (for Unicode 2.x). +
    • +
    • Values Cari, Cham, Kali, Lepc, + Lyci, Lydi, Olck, Rjng, Saur, Sund and + Vai + added to the sc attribute (for Unicode 5.0). +
    • +
    • + jamo + attribute renamed to + JSN +
    • +
    • + sfc + attribute renamed to + scf +
    • +
    • Attribute kXHC1983 added (for Unicode 5.1.0). +
    • +
    • Pattern for attribute kIRG_USource extended (for Unicode 5.1.0). +
    • +
    • Element provisional-named-sequences added (for Unicode 5.0) +
    • +
    +
    +
    +

    + Revision 1 +

    +
      +
    • First working draft.
    • +
    +
    +
    + + + +
    + + diff --git a/unicodetools/src/main/resources/org/unicode/uax42/output/index.rnc b/unicodetools/src/main/resources/org/unicode/uax42/output/index.rnc new file mode 100644 index 0000000000..84d9b5875c --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/output/index.rnc @@ -0,0 +1,1455 @@ + + # Copyright © 2024 Unicode, Inc. + + + + default namespace ucd = "http://www.unicode.org/ns/2003/ucd/1.0" + + + # default; datatypes xsd = "http://www.w3.org/2001/XMLSchema-datatypes" + + single-code-point = xsd:string { pattern = "(|[1-9A-F]|(10))[0-9A-F]{4}" } + + one-or-more-code-points = list { single-code-point + } + zero-or-more-code-points = list { single-code-point * } + two-code-points = list { single-code-point, single-code-point } + + jis-code-point = xsd:string { pattern = "[0-9A-F]{4}" } + + + start = + element ucd { ucd.content } + + + boolean = "Y" | "N" + + + ucd.content &= + element description { text }? + + + ucd.content &= + element repertoire { (code-point | group) + }? + + set-of-code-points = + attribute cp { single-code-point } + | ( attribute first-cp { single-code-point }, + attribute last-cp { single-code-point } ) + + code-point |= + element reserved { + set-of-code-points, + code-point-attributes } + + code-point |= + element noncharacter { + set-of-code-points, + code-point-attributes } + + code-point |= + element surrogate { + set-of-code-points, + code-point-attributes } + + code-point |= + element char { + set-of-code-points, + code-point-attributes } + + group = + element group { + code-point-attributes, + code-point* } + + + code-point-attributes &= + attribute age { "1.1" + | "2.0" | "2.1" + | "3.0" | "3.1" | "3.2" + | "4.0" | "4.1" + | "5.0" | "5.1" | "5.2" + | "6.0" | "6.1" | "6.2" | "6.3" + | "7.0" + | "8.0" + | "9.0" + | "10.0" + | "11.0" + | "12.0" | "12.1" + | "13.0" + | "14.0" + | "15.0" | "15.1" + | "16.0" + | "17.0" + | "unassigned" + }? 
+ + code-point-attributes &= + attribute na { "" | + "CJK UNIFIED IDEOGRAPH-#" | + "CJK COMPATIBILITY IDEOGRAPH-#" | + "EGYPTIAN HIEROGLYPH-#" | + "TANGUT IDEOGRAPH-#" | + "KHITAN SMALL SCRIPT CHARACTER-#" | + "NUSHU CHARACTER-#" | + xsd:string { pattern="[a-zA-Z0-9]+(( -|- |[\-_ ])[a-zA-Z0-9]+)*" } + }? + + code-point-attributes &= + attribute na1 { "" | xsd:string { pattern="[a-zA-Z0-9]+([\-_ ][a-zA-Z0-9]+)*( \(.*\))?" } }? + + code-point-attributes &= + element name-alias { + attribute alias { xsd:string { pattern="[a-zA-Z0-9]+(( -|- |[\-_ ])[a-zA-Z0-9]+)*" } }?, + attribute type { "abbreviation" | "alternate" + | "control" | "correction" + | "figment" + }? } * + + code-point-attributes &= + attribute blk { "Adlam" + | "Aegean_Numbers" + | "Ahom" + | "Alchemical" + | "Alphabetic_PF" + | "Anatolian_Hieroglyphs" + | "Ancient_Greek_Music" + | "Ancient_Greek_Numbers" + | "Ancient_Symbols" + | "Arabic" + | "Arabic_Ext_A" + | "Arabic_Ext_B" + | "Arabic_Ext_C" + | "Arabic_Math" + | "Arabic_PF_A" + | "Arabic_PF_B" + | "Arabic_Sup" + | "Armenian" + | "Arrows" + | "ASCII" + | "Avestan" + | "Balinese" + | "Bamum" + | "Bamum_Sup" + | "Bassa_Vah" + | "Batak" + | "Bengali" + | "Bhaiksuki" + | "Block_Elements" + | "Bopomofo" + | "Bopomofo_Ext" + | "Box_Drawing" + | "Brahmi" + | "Braille" + | "Buginese" + | "Buhid" + | "Byzantine_Music" + | "Carian" + | "Caucasian_Albanian" + | "Chakma" + | "Cham" + | "Cherokee" + | "Cherokee_Sup" + | "Chess_Symbols" + | "Chorasmian" + | "CJK" + | "CJK_Compat" + | "CJK_Compat_Forms" + | "CJK_Compat_Ideographs" + | "CJK_Compat_Ideographs_Sup" + | "CJK_Ext_A" + | "CJK_Ext_B" + | "CJK_Ext_C" + | "CJK_Ext_D" + | "CJK_Ext_E" + | "CJK_Ext_F" + | "CJK_Ext_G" + | "CJK_Ext_H" + | "CJK_Ext_I" + | "CJK_Radicals_Sup" + | "CJK_Strokes" + | "CJK_Symbols" + | "Compat_Jamo" + | "Control_Pictures" + | "Coptic" + | "Coptic_Epact_Numbers" + | "Counting_Rod" + | "Cuneiform" + | "Cuneiform_Numbers" + | "Currency_Symbols" + | "Cypriot_Syllabary" + | "Cypro_Minoan" + 
| "Cyrillic" + | "Cyrillic_Ext_A" + | "Cyrillic_Ext_B" + | "Cyrillic_Ext_C" + | "Cyrillic_Ext_D" + | "Cyrillic_Sup" + | "Deseret" + | "Devanagari" + | "Devanagari_Ext" + | "Devanagari_Ext_A" + | "Diacriticals" + | "Diacriticals_Ext" + | "Diacriticals_For_Symbols" + | "Diacriticals_Sup" + | "Dingbats" + | "Dives_Akuru" + | "Dogra" + | "Domino" + | "Duployan" + | "Early_Dynastic_Cuneiform" + | "Egyptian_Hieroglyph_Format_Controls" + | "Egyptian_Hieroglyphs" + | "Egyptian_Hieroglyphs_Ext_A" + | "Elbasan" + | "Elymaic" + | "Emoticons" + | "Enclosed_Alphanum" + | "Enclosed_Alphanum_Sup" + | "Enclosed_CJK" + | "Enclosed_Ideographic_Sup" + | "Ethiopic" + | "Ethiopic_Ext" + | "Ethiopic_Ext_A" + | "Ethiopic_Ext_B" + | "Ethiopic_Sup" + | "Garay" + | "Geometric_Shapes" + | "Geometric_Shapes_Ext" + | "Georgian" + | "Georgian_Ext" + | "Georgian_Sup" + | "Glagolitic" + | "Glagolitic_Sup" + | "Gothic" + | "Grantha" + | "Greek" + | "Greek_Ext" + | "Gujarati" + | "Gunjala_Gondi" + | "Gurmukhi" + | "Gurung_Khema" + | "Half_And_Full_Forms" + | "Half_Marks" + | "Hangul" + | "Hanifi_Rohingya" + | "Hanunoo" + | "Hatran" + | "Hebrew" + | "High_PU_Surrogates" + | "High_Surrogates" + | "Hiragana" + | "IDC" + | "Ideographic_Symbols" + | "Imperial_Aramaic" + | "Indic_Number_Forms" + | "Indic_Siyaq_Numbers" + | "Inscriptional_Pahlavi" + | "Inscriptional_Parthian" + | "IPA_Ext" + | "Jamo" + | "Jamo_Ext_A" + | "Jamo_Ext_B" + | "Javanese" + | "Kaithi" + | "Kaktovik_Numerals" + | "Kana_Ext_A" + | "Kana_Ext_B" + | "Kana_Sup" + | "Kanbun" + | "Kangxi" + | "Kannada" + | "Katakana" + | "Katakana_Ext" + | "Kawi" + | "Kayah_Li" + | "Kharoshthi" + | "Khitan_Small_Script" + | "Khmer" + | "Khmer_Symbols" + | "Khojki" + | "Khudawadi" + | "Kirat_Rai" + | "Lao" + | "Latin_1_Sup" + | "Latin_Ext_A" + | "Latin_Ext_Additional" + | "Latin_Ext_B" + | "Latin_Ext_C" + | "Latin_Ext_D" + | "Latin_Ext_E" + | "Latin_Ext_F" + | "Latin_Ext_G" + | "Lepcha" + | "Letterlike_Symbols" + | "Limbu" + | "Linear_A" + | 
"Linear_B_Ideograms" + | "Linear_B_Syllabary" + | "Lisu" + | "Lisu_Sup" + | "Low_Surrogates" + | "Lycian" + | "Lydian" + | "Mahajani" + | "Mahjong" + | "Makasar" + | "Malayalam" + | "Mandaic" + | "Manichaean" + | "Marchen" + | "Masaram_Gondi" + | "Math_Alphanum" + | "Math_Operators" + | "Mayan_Numerals" + | "Medefaidrin" + | "Meetei_Mayek" + | "Meetei_Mayek_Ext" + | "Mende_Kikakui" + | "Meroitic_Cursive" + | "Meroitic_Hieroglyphs" + | "Miao" + | "Misc_Arrows" + | "Misc_Math_Symbols_A" + | "Misc_Math_Symbols_B" + | "Misc_Pictographs" + | "Misc_Symbols" + | "Misc_Technical" + | "Modi" + | "Modifier_Letters" + | "Modifier_Tone_Letters" + | "Mongolian" + | "Mongolian_Sup" + | "Mro" + | "Multani" + | "Music" + | "Myanmar" + | "Myanmar_Ext_A" + | "Myanmar_Ext_B" + | "Myanmar_Ext_C" + | "Nabataean" + | "Nag_Mundari" + | "Nandinagari" + | "NB" + | "New_Tai_Lue" + | "Newa" + | "NKo" + | "Number_Forms" + | "Nushu" + | "Nyiakeng_Puachue_Hmong" + | "OCR" + | "Ogham" + | "Ol_Chiki" + | "Ol_Onal" + | "Old_Hungarian" + | "Old_Italic" + | "Old_North_Arabian" + | "Old_Permic" + | "Old_Persian" + | "Old_Sogdian" + | "Old_South_Arabian" + | "Old_Turkic" + | "Old_Uyghur" + | "Oriya" + | "Ornamental_Dingbats" + | "Osage" + | "Osmanya" + | "Ottoman_Siyaq_Numbers" + | "Pahawh_Hmong" + | "Palmyrene" + | "Pau_Cin_Hau" + | "Phags_Pa" + | "Phaistos" + | "Phoenician" + | "Phonetic_Ext" + | "Phonetic_Ext_Sup" + | "Playing_Cards" + | "Psalter_Pahlavi" + | "PUA" + | "Punctuation" + | "Rejang" + | "Rumi" + | "Runic" + | "Samaritan" + | "Saurashtra" + | "Sharada" + | "Shavian" + | "Shorthand_Format_Controls" + | "Siddham" + | "Sinhala" + | "Sinhala_Archaic_Numbers" + | "Small_Forms" + | "Small_Kana_Ext" + | "Sogdian" + | "Sora_Sompeng" + | "Soyombo" + | "Specials" + | "Sundanese" + | "Sundanese_Sup" + | "Sunuwar" + | "Sup_Arrows_A" + | "Sup_Arrows_B" + | "Sup_Arrows_C" + | "Sup_Math_Operators" + | "Sup_PUA_A" + | "Sup_PUA_B" + | "Sup_Punctuation" + | "Sup_Symbols_And_Pictographs" + | 
"Super_And_Sub" + | "Sutton_SignWriting" + | "Syloti_Nagri" + | "Symbols_And_Pictographs_Ext_A" + | "Symbols_For_Legacy_Computing" + | "Symbols_For_Legacy_Computing_Sup" + | "Syriac" + | "Syriac_Sup" + | "Tagalog" + | "Tagbanwa" + | "Tags" + | "Tai_Le" + | "Tai_Tham" + | "Tai_Viet" + | "Tai_Xuan_Jing" + | "Takri" + | "Tamil" + | "Tamil_Sup" + | "Tangsa" + | "Tangut" + | "Tangut_Components" + | "Tangut_Sup" + | "Telugu" + | "Thaana" + | "Thai" + | "Tibetan" + | "Tifinagh" + | "Tirhuta" + | "Todhri" + | "Toto" + | "Transport_And_Map" + | "Tulu_Tigalari" + | "UCAS" + | "UCAS_Ext" + | "UCAS_Ext_A" + | "Ugaritic" + | "Vai" + | "Vedic_Ext" + | "Vertical_Forms" + | "Vithkuqi" + | "VS" + | "VS_Sup" + | "Wancho" + | "Warang_Citi" + | "Yezidi" + | "Yi_Radicals" + | "Yi_Syllables" + | "Yijing" + | "Zanabazar_Square" + | "Znamenny_Music" + }? + + code-point-attributes &= + attribute gc { "Cc" | "Cf" | "Cn" | "Co" | "Cs" + | "Ll" | "Lm" | "Lo" | "Lt" | "Lu" + | "Mc" | "Me" | "Mn" + | "Nd" | "Nl" | "No" + | "Pc" | "Pd" | "Pe" | "Pf" | "Pi" | "Po" | "Ps" + | "Sc" | "Sk" | "Sm" | "So" + | "Zl" | "Zp" | "Zs" + }? + + code-point-attributes &= + attribute ccc { xsd:integer { minInclusive="0" maxInclusive="254" } }? + + code-point-attributes &= + attribute bc { "AL" | "AN" + | "B" | "BN" + | "CS" + | "EN" | "ES" | "ET" + | "FSI" + | "L" | "LRE" | "LRI" | "LRO" + | "NSM" + | "ON" + | "PDF" | "PDI" + | "R" | "RLE" | "RLI" | "RLO" + | "S" + | "WS" + }? + + code-point-attributes &= + attribute Bidi_M { boolean }? + + code-point-attributes &= + attribute bmg { "" | single-code-point }? + + code-point-attributes &= + attribute Bidi_C { boolean }? + + code-point-attributes &= + attribute bpt { "o" | "c" | "n" }? + + code-point-attributes &= + attribute bpb { "#" | single-code-point }? + + code-point-attributes &= + attribute dt { "can" | "com" | "enc" | "fin" | "font" | "fra" + | "init" | "iso" | "med" | "nar" | "nb" | "sml" + | "sqr" | "sub" | "sup" | "vert" | "wide" | "none" + }? 
+ + code-point-attributes &= + attribute dm { "#" | zero-or-more-code-points }? + + code-point-attributes &= + attribute CE { boolean }? + + code-point-attributes &= + attribute Comp_Ex { boolean }? + + code-point-attributes &= + attribute NFC_QC { "Y" | "N" | "M" }? + + code-point-attributes &= + attribute NFD_QC { "Y" | "N" }? + + code-point-attributes &= + attribute NFKC_QC { "Y" | "N" | "M" }? + + code-point-attributes &= + attribute NFKD_QC { "Y" | "N" }? + + + code-point-attributes &= + attribute XO_NFC { boolean }? + + code-point-attributes &= + attribute XO_NFD { boolean }? + + code-point-attributes &= + attribute XO_NFKC { boolean }? + + code-point-attributes &= + attribute XO_NFKD { boolean }? + + + code-point-attributes &= + attribute FC_NFKC { "#" | one-or-more-code-points }? + + code-point-attributes &= + attribute nt { "De" | "Di" | "Nu" | "None" }? + + code-point-attributes &= + attribute nv { "NaN" | xsd:string { pattern="-?[0-9]+(/[0-9]+)?" } }? + + code-point-attributes &= + attribute jt { "C" | "D" | "L" | "R" | "T" | "U" }? 
+ + code-point-attributes &= + attribute jg { "African_Feh" | "African_Noon" | "African_Qaf" + | "Ain" | "Alaph" | "Alef" + | "Beh" | "Beth" | "Burushaski_Yeh_Barree" + | "Dal" | "Dalath_Rish" + | "E" + | "Farsi_Yeh" | "Fe" | "Feh" | "Final_Semkath" + | "Gaf" | "Gamal" + | "Hah" | "Hanifi_Rohingya_Kinna_Ya" + | "Hanifi_Rohingya_Pa" | "He" | "Heh" | "Heh_Goal" + | "Heth" + | "Kaf" | "Kaph" | "Kashmiri_Yeh" | "Khaph" + | "Knotted_Heh" + | "Lam" | "Lamadh" + | "Malayalam_Bha" | "Malayalam_Ja" | "Malayalam_Lla" + | "Malayalam_Llla" | "Malayalam_Nga" + | "Malayalam_Nna" | "Malayalam_Nnna" + | "Malayalam_Nya" | "Malayalam_Ra" | "Malayalam_Ssa" + | "Malayalam_Tta" | "Manichaean_Aleph" + | "Manichaean_Ayin" | "Manichaean_Beth" + | "Manichaean_Daleth" | "Manichaean_Dhamedh" + | "Manichaean_Five" | "Manichaean_Gimel" + | "Manichaean_Heth" | "Manichaean_Hundred" + | "Manichaean_Kaph" | "Manichaean_Lamedh" + | "Manichaean_Mem" | "Manichaean_Nun" + | "Manichaean_One" | "Manichaean_Pe" + | "Manichaean_Qoph" | "Manichaean_Resh" + | "Manichaean_Sadhe" | "Manichaean_Samekh" + | "Manichaean_Taw" | "Manichaean_Ten" + | "Manichaean_Teth" | "Manichaean_Thamedh" + | "Manichaean_Twenty" | "Manichaean_Waw" + | "Manichaean_Yodh" | "Manichaean_Zayin" | "Meem" + | "Mim" + | "No_Joining_Group" | "Noon" | "Nun" | "Nya" + | "Pe" + | "Qaf" | "Qaph" + | "Reh" | "Reversed_Pe" | "Rohingya_Yeh" + | "Sad" | "Sadhe" | "Seen" | "Semkath" | "Shin" + | "Straight_Waw" | "Swash_Kaf" | "Syriac_Waw" + | "Tah" | "Taw" | "Teh_Marbuta" | "Teh_Marbuta_Goal" + | "Teth" | "Thin_Yeh" + | "Vertical_Tail" + | "Waw" + | "Yeh" | "Yeh_Barree" | "Yeh_With_Tail" | "Yudh" + | "Yudh_He" + | "Zain" | "Zhain" + }? + + code-point-attributes &= + attribute Join_C { boolean }? 
+ + code-point-attributes &= + attribute lb { "AI" | "AK" | "AL" | "AP" | "AS" + | "B2" | "BA" | "BB" | "BK" + | "CB" | "CJ" | "CL" | "CM" | "CP" | "CR" + | "EB" | "EM" | "EX" + | "GL" + | "H2" | "H3" | "HL" | "HY" + | "ID" | "IN" | "IS" + | "JL" | "JT" | "JV" + | "LF" + | "NL" | "NS" | "NU" + | "OP" + | "PO" | "PR" + | "QU" + | "RI" + | "SA" | "SG" | "SP" | "SY" + | "VF" | "VI" + | "WJ" + | "XX" + | "ZW" | "ZWJ" + }? + + code-point-attributes &= + attribute ea { "A" | "F" | "H" | "N" | "Na" | "W" }? + + code-point-attributes &= + attribute Upper { boolean }? + + code-point-attributes &= + attribute Lower { boolean }? + + code-point-attributes &= + attribute OUpper { boolean }? + + code-point-attributes &= + attribute OLower { boolean }? + + code-point-attributes &= + attribute suc { "#" | single-code-point }? + + code-point-attributes &= + attribute slc { "#" | single-code-point }? + + code-point-attributes &= + attribute stc { "#" | single-code-point }? + + code-point-attributes &= + attribute uc { "#" | one-or-more-code-points }? + + code-point-attributes &= + attribute lc { "#" | one-or-more-code-points }? + + code-point-attributes &= + attribute tc { "#" | one-or-more-code-points }? + + code-point-attributes &= + attribute scf { "#" | single-code-point }? + + code-point-attributes &= + attribute cf { "#" | one-or-more-code-points }? + + code-point-attributes &= + attribute CI { boolean }? + + code-point-attributes &= + attribute Cased { boolean }? + + code-point-attributes &= + attribute CWCF { boolean }? + + code-point-attributes &= + attribute CWCM { boolean }? + + code-point-attributes &= + attribute CWL { boolean }? + + code-point-attributes &= + attribute CWKCF { boolean }? + + code-point-attributes &= + attribute CWT { boolean }? + + code-point-attributes &= + attribute CWU { boolean }? + + code-point-attributes &= + attribute NFKC_CF { "#" | zero-or-more-code-points }? + + code-point-attributes &= + attribute NFKC_SCF { "#" | zero-or-more-code-points }? 
+ + script = "Adlm" | "Aghb" | "Ahom" | "Arab" | "Armi" | "Armn" + | "Avst" + | "Bali" | "Bamu" | "Bass" | "Batk" | "Beng" | "Bhks" + | "Bopo" | "Brah" | "Brai" | "Bugi" | "Buhd" + | "Cakm" | "Cans" | "Cari" | "Cham" | "Cher" | "Chrs" + | "Copt" | "Cpmn" | "Cprt" | "Cyrl" + | "Deva" | "Diak" | "Dogr" | "Dsrt" | "Dupl" + | "Egyp" | "Elba" | "Elym" | "Ethi" + | "Gara" | "Geor" | "Glag" | "Gong" | "Gonm" | "Goth" + | "Gran" | "Grek" | "Gujr" | "Gukh" | "Guru" + | "Hang" | "Hani" | "Hano" | "Hatr" | "Hebr" | "Hira" + | "Hluw" | "Hmng" | "Hmnp" | "Hrkt" | "Hung" + | "Ital" + | "Java" + | "Kali" | "Kana" | "Kawi" | "Khar" | "Khmr" | "Khoj" + | "Kits" | "Knda" | "Krai" | "Kthi" + | "Lana" | "Laoo" | "Latn" | "Lepc" | "Limb" | "Lina" + | "Linb" | "Lisu" | "Lyci" | "Lydi" + | "Mahj" | "Maka" | "Mand" | "Mani" | "Marc" | "Medf" + | "Mend" | "Merc" | "Mero" | "Mlym" | "Modi" | "Mong" + | "Mroo" | "Mtei" | "Mult" | "Mymr" + | "Nagm" | "Nand" | "Narb" | "Nbat" | "Newa" | "Nkoo" + | "Nshu" + | "Ogam" | "Olck" | "Onao" | "Orkh" | "Orya" | "Osge" + | "Osma" | "Ougr" + | "Palm" | "Pauc" | "Perm" | "Phag" | "Phli" | "Phlp" + | "Phnx" | "Plrd" | "Prti" + | "Rjng" | "Rohg" | "Runr" + | "Samr" | "Sarb" | "Saur" | "Sgnw" | "Shaw" | "Shrd" + | "Sidd" | "Sind" | "Sinh" | "Sogd" | "Sogo" | "Sora" + | "Soyo" | "Sund" | "Sunu" | "Sylo" | "Syrc" + | "Tagb" | "Takr" | "Tale" | "Talu" | "Taml" | "Tang" + | "Tavt" | "Telu" | "Tfng" | "Tglg" | "Thaa" | "Thai" + | "Tibt" | "Tirh" | "Tnsa" | "Todr" | "Toto" | "Tutg" + | "Ugar" + | "Vaii" | "Vith" + | "Wara" | "Wcho" + | "Xpeo" | "Xsux" + | "Yezi" | "Yiii" + | "Zanb" | "Zinh" | "Zyyy" | "Zzzz" + + code-point-attributes &= + attribute sc { script }? + + code-point-attributes &= + attribute scx { list { script + } }? + + code-point-attributes &= + attribute isc { text }? + + code-point-attributes &= + attribute hst { "L" | "LV" | "LVT" | "NA" | "T" | "V" }? + + code-point-attributes &= + attribute JSN { xsd:string { pattern="[A-Z]{0,3}" } }? 
+ + code-point-attributes &= + attribute InSC { "Avagraha" + | "Bindu" + | "Brahmi_Joining_Number" + | "Cantillation_Mark" + | "Consonant" + | "Consonant_Dead" + | "Consonant_Final" + | "Consonant_Head_Letter" + | "Consonant_Initial_Postfixed" + | "Consonant_Killer" + | "Consonant_Medial" + | "Consonant_Placeholder" + | "Consonant_Preceding_Repha" + | "Consonant_Prefixed" + | "Consonant_Subjoined" + | "Consonant_Succeeding_Repha" + | "Consonant_With_Stacker" + | "Gemination_Mark" + | "Invisible_Stacker" + | "Joiner" + | "Modifying_Letter" + | "Non_Joiner" + | "Nukta" + | "Number" + | "Number_Joiner" + | "Other" + | "Pure_Killer" + | "Register_Shifter" + | "Reordering_Killer" + | "Syllable_Modifier" + | "Tone_Letter" + | "Tone_Mark" + | "Virama" + | "Visarga" + | "Vowel" + | "Vowel_Dependent" + | "Vowel_Independent" + }? + + code-point-attributes &= + attribute InPC { "Bottom" + | "Bottom_And_Left" + | "Bottom_And_Right" + | "Left" + | "Left_And_Right" + | "NA" + | "Overstruck" + | "Right" + | "Top" + | "Top_And_Bottom" + | "Top_And_Bottom_And_Left" + | "Top_And_Bottom_And_Right" + | "Top_And_Left" + | "Top_And_Left_And_Right" + | "Top_And_Right" + | "Visual_Order_Left" + }? + + code-point-attributes &= + attribute InCB { "Consonant" + | "Extend" + | "Linker" + | "None" + }? + + code-point-attributes &= + attribute IDS { boolean }? + + code-point-attributes &= + attribute OIDS { boolean }? + + code-point-attributes &= + attribute XIDS { boolean }? + + code-point-attributes &= + attribute IDC { boolean }? + + code-point-attributes &= + attribute OIDC { boolean }? + + code-point-attributes &= + attribute XIDC { boolean }? + + code-point-attributes &= + attribute ID_Compat_Math_Start { boolean }? + + code-point-attributes &= + attribute ID_Compat_Math_Continue { boolean }? + + code-point-attributes &= + attribute Pat_Syn { boolean }? + + code-point-attributes &= + attribute Pat_WS { boolean }? + + code-point-attributes &= + attribute Dash { boolean }? 
+ + code-point-attributes &= + attribute Hyphen { boolean }? + + code-point-attributes &= + attribute QMark { boolean }? + + code-point-attributes &= + attribute Term { boolean }? + + code-point-attributes &= + attribute STerm { boolean }? + + code-point-attributes &= + attribute Dia { boolean }? + + code-point-attributes &= + attribute Ext { boolean }? + + code-point-attributes &= + attribute SD { boolean }? + + code-point-attributes &= + attribute Alpha { boolean }? + + code-point-attributes &= + attribute OAlpha { boolean }? + + code-point-attributes &= + attribute Math { boolean }? + + code-point-attributes &= + attribute OMath { boolean }? + + code-point-attributes &= + attribute Hex { boolean }? + + code-point-attributes &= + attribute AHex { boolean }? + + code-point-attributes &= + attribute DI { boolean }? + + code-point-attributes &= + attribute ODI { boolean }? + + code-point-attributes &= + attribute LOE { boolean }? + + code-point-attributes &= + attribute PCM { boolean }? + + code-point-attributes &= + attribute MCM { boolean }? + + code-point-attributes &= + attribute WSpace { boolean }? + + code-point-attributes &= + attribute vo { "R" | "Tr" | "Tu" | "U" }? + + code-point-attributes &= + attribute RI { boolean }? + + code-point-attributes &= + attribute Gr_Base { boolean }? + + code-point-attributes &= + attribute Gr_Ext { boolean }? + + code-point-attributes &= + attribute OGr_Ext { boolean }? + + code-point-attributes &= + attribute Gr_Link { boolean }? + + code-point-attributes &= + attribute GCB { "CN" | "CR" + | "EB" | "EBG" | "EM" | "EX" + | "GAZ" + | "L" | "LF" | "LV" | "LVT" + | "PP" + | "RI" + | "SM" + | "T" + | "V" + | "XX" + | "ZWJ" + }? + + code-point-attributes &= + attribute WB { "CR" + | "DQ" + | "EB" | "EBG" | "EM" | "EX" | "Extend" + | "FO" + | "GAZ" + | "HL" + | "KA" + | "LE" | "LF" + | "MB" | "ML" | "MN" + | "NL" | "NU" + | "RI" + | "SQ" + | "WSegSpace" + | "XX" + | "ZWJ" + }? 
+ + code-point-attributes &= + attribute SB { "AT" + | "CL" | "CR" + | "EX" + | "FO" + | "LE" | "LF" | "LO" + | "NU" + | "SC" | "SE" | "SP" | "ST" + | "UP" + | "XX" + }? + + code-point-attributes &= + attribute Ideo { boolean }? + + code-point-attributes &= + attribute UIdeo { boolean }? + + code-point-attributes &= + attribute EqUIdeo { single-code-point }? + + code-point-attributes &= + attribute IDSB { boolean }? + + code-point-attributes &= + attribute IDST { boolean }? + + code-point-attributes &= + attribute IDSU { boolean }? + + code-point-attributes &= + attribute Radical { boolean }? + + code-point-attributes &= + attribute Dep { boolean }? + + code-point-attributes &= + attribute VS { boolean }? + + code-point-attributes &= + attribute NChar { boolean }? + + code-point-attributes &= attribute kAccountingNumeric + { xsd:string { pattern="[0-9]+" } }? + + code-point-attributes &= attribute kAlternateTotalStrokes + { list { xsd:string { pattern="(\d+:[BHJKMPSUV]+)|-" }+ } }? + + code-point-attributes &= attribute kBigFive + { xsd:string { pattern="[0-9A-F]{4}'?" } }? + + code-point-attributes &= attribute kCangjie + { xsd:string { pattern="[A-Z]+" } }? + + code-point-attributes &= attribute kCantonese + { list { xsd:string { pattern="[a-z]{1,6}[1-6]" }+ } }? + + code-point-attributes &= attribute kCCCII + { list { xsd:string { pattern="[0-9A-F]{6}" }+ } }? + + code-point-attributes &= attribute kCheungBauer + { list { xsd:string { pattern="[0-9]{3}/[0-9]{2};[A-Z]*;[a-z1-6\[\]/,]+" }+ } }? + + code-point-attributes &= attribute kCheungBauerIndex + { list { xsd:string { pattern="[0-9]{3}\.[01][0-9]" }+ } }? + + code-point-attributes &= attribute kCihaiT + { list { xsd:string { pattern="[1-9][0-9]{0,3}\.[0-9]{3}" }+ } }? + + code-point-attributes &= attribute kCNS1986 + { xsd:string { pattern="[12E]-[0-9A-F]{4}" } }? + + code-point-attributes &= attribute kCNS1992 + { xsd:string { pattern="[1-9]-[0-9A-F]{4}" } }? 
+ + code-point-attributes &= attribute kCompatibilityVariant + { "" | xsd:string { pattern="U\+[23]?[0-9A-F]{4}" } }? + + code-point-attributes &= attribute kCowles + { list { xsd:string { pattern="[0-9]{1,4}(\.[0-9]{1,2})?" }+ } }? + + code-point-attributes &= attribute kDaeJaweon + { xsd:string { pattern="[0-9]{4}\.[0-9]{2}[01]" } }? + + code-point-attributes &= attribute kDefinition + { xsd:string { pattern='[^\t"]+' } }? + + code-point-attributes &= attribute kEACC + { xsd:string { pattern="[0-9A-F]{6}" } }? + + code-point-attributes &= attribute kFanqie + { list { xsd:string { pattern="[\x{3400}-\x{4DBF}\x{4E00}-\x{9FFF}\x{20000}-\x{2A6DF}]{2}" }+ } }? + + code-point-attributes &= attribute kFenn + { list { xsd:string { pattern="[0-9]+a?[A-KP*]" }+ } }? + + code-point-attributes &= attribute kFennIndex + { list { xsd:string { pattern="[0-9][0-9]{0,2}\.[01][0-9]" }+ } }? + + code-point-attributes &= attribute kFourCornerCode + { list { xsd:string { pattern="[0-9]{4}(\.[0-9])?" }+ } }? + + code-point-attributes &= attribute kGB0 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB1 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB3 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB5 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB7 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB8 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGradeLevel + { xsd:string { pattern="[1-6]" } }? + + code-point-attributes &= attribute kGSR + { list { xsd:string { pattern="[0-9]{4}[a-vx-z]'?" }+ } }? + + code-point-attributes &= attribute kHangul + { list { xsd:string { pattern="[\x{1100}-\x{1112}][\x{1161}-\x{1175}][\x{11A8}-\x{11C2}]?:[01ENX]{1,3}" }+ } }? + + code-point-attributes &= attribute kHanYu + { list { xsd:string { pattern="[1-8][0-9]{4}\.[0-3][0-9][0-3]" }+ } }? 
+ + code-point-attributes &= attribute kHanyuPinlu + { list { xsd:string { pattern="[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+\([0-9]+\)" }+ } }? + + code-point-attributes &= attribute kHanyuPinyin + { list { xsd:string { pattern="(\d{5}\.\d{2}0,)*\d{5}\.\d{2}0:([a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+,)*[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kHDZRadBreak + { xsd:string { pattern="[\x{2F00}-\x{2FD5}]\[U\+2F[0-9A-D][0-9A-F]\]:[1-8][0-9]{4}\.[0-3][0-9]0" } }? + + code-point-attributes &= attribute kHKGlyph + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kIBMJapan + { list { xsd:string { pattern="F[ABC][0-9A-F]{2}" }+ } }? + + code-point-attributes &= attribute kIICore + { list { xsd:string { pattern="[ABC][GHJKMPT]{1,7}" }+ } }? + + code-point-attributes &= attribute kIRG_GSource + { "" | xsd:string { pattern="G[013578EKS]-[0-9A-F]{4}" } + | xsd:string { pattern="G4K(-\d{5})?" } + | xsd:string { pattern="G(DZ|GH|RM|WZ|XC|XH|ZH)-\d{4}\.\d{2}" } + | xsd:string { pattern="G(BK|CH|CY|HC)(-\d{4}\.\d{2})?" } + | xsd:string { pattern="GKX-\d{4}\.\d{2,3}" } + | xsd:string { pattern="G(HZ|HZR)-\d{5}\.\d{2}" } + | xsd:string { pattern="G(CE|FC|IDC23|OCD|XHZ)-\d{3}" } + | xsd:string { pattern="G(H|HF|LGYJ|PGLG|T)-\d{4}" } + | xsd:string { pattern="G(CYY|DM|JZ|KJ|XM|ZFY|ZJW|ZYS)-\d{5}" } + | xsd:string { pattern="G(FZ|IDC)-[0-9A-F]{4}" } + | xsd:string { pattern="GGFZ-\d{6}" } + | xsd:string { pattern="G(LK|Z)-\d{7}" } + | xsd:string { pattern="GU-[023][0-9A-F]{4}" } + | xsd:string { pattern="GZA-[123467]\d{5}" } + }? + + code-point-attributes &= attribute kIRG_HSource + { "" | xsd:string { pattern="H-[0-9A-F]{4}" } + | xsd:string { pattern="H(B[012])-[0-9A-F]{4}" } + | xsd:string { pattern="HD-[23]?[0-9A-F]{4}" } + | xsd:string { pattern="HU-[023][0-9A-F]{4}" } + }? 
+ + code-point-attributes &= attribute kIRG_JSource + { "" | xsd:string { pattern="J[014]-[0-9A-F]{4}" } + | xsd:string { pattern="J3A?-[0-9A-F]{4}" } + | xsd:string { pattern="J13A?-[0-9A-F]{4}" } + | xsd:string { pattern="J14-[0-9A-F]{4}" } + | xsd:string { pattern="JA[34]?-[0-9A-F]{4}" } + | xsd:string { pattern="JARIB-[0-9A-F]{4}" } + | xsd:string { pattern="JH-(JT[ABC][0-9A-F]{3}S?|IB\d{4}|\d{6})" } + | xsd:string { pattern="JK-\d{5}" } + | xsd:string { pattern="JMJ-\d{6}" } + }? + + code-point-attributes &= attribute kIRG_KPSource + { "" | xsd:string { pattern="KP([01]-[0-9A-F]{4}|U-[023][0-9A-F]{4})" } }? + + code-point-attributes &= attribute kIRG_KSource + { "" | xsd:string { pattern="K[0-6]-[0-9A-F]{4}" } + | xsd:string { pattern="KC-\d{5}" } + | xsd:string { pattern="KU-[023][0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRG_MSource + { "" | xsd:string { pattern="MA-[0-9A-F]{4}" } + | xsd:string { pattern="MB[12]-[0-9A-F]{4}" } + | xsd:string { pattern="MC-\d{5}" } + | xsd:string { pattern="MDH?-[23]?[0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRG_SSource + { "" | xsd:string { pattern="SAT-\d{5}" } }? + + code-point-attributes &= attribute kIRG_TSource + { "" | xsd:string { pattern="T([1-7A-F]|1[1-3])-[0-9A-F]{4}" } + | xsd:string { pattern="TU-[023][0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRG_UKSource + { "" | xsd:string { pattern="UK-\d{5}" } }? + + code-point-attributes &= attribute kIRG_USource + { "" | xsd:string { pattern="UTC-\d{5}" } }? + + code-point-attributes &= attribute kIRG_VSource + { "" | xsd:string { pattern="V[0-4]-[0-9A-F]{4}" } + | xsd:string { pattern="VN-[023F][0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRGDaeJaweon + { list { xsd:string { pattern="[0-9]{4}\.[0-9]{2}[01]" }+ } }? + + code-point-attributes &= attribute kIRGHanyuDaZidian + { list { xsd:string { pattern="[1-8][0-9]{4}\.[0-3][0-9][01]" }+ } }? 
+ + code-point-attributes &= attribute kIRGKangXi + { list { xsd:string { pattern="[01][0-9]{3}\.[0-7][0-9][01]" }+ } }? + + code-point-attributes &= attribute kJa + { list { xsd:string { pattern="[0-9A-F]{4}S?" }+ } }? + + code-point-attributes &= attribute kJapanese + { list { xsd:string { pattern="[\x{3041}-\x{3096}\x{3099}\x{309A}\x{30A1}-\x{30FA}\x{30FC}]+" }+ } }? + + code-point-attributes &= attribute kJapaneseKun + { list { xsd:string { pattern="[A-Z]+" }+ } }? + + code-point-attributes &= attribute kJapaneseOn + { list { xsd:string { pattern="[A-Z]+" }+ } }? + + code-point-attributes &= attribute kJinmeiyoKanji + { list { xsd:string { pattern="(20[0-9]{2})(:U\+[23]?[0-9A-F]{4})?" }+ } }? + + code-point-attributes &= attribute kJis0 + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kJis1 + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kJIS0213 + { list { xsd:string { pattern="[12],[0-9]{2},[0-9]{1,2}" }+ } }? + + code-point-attributes &= attribute kJoyoKanji + { list { xsd:string { pattern="(20[0-9]{2})|(U\+[23]?[0-9A-F]{4})" }+ } }? + + code-point-attributes &= attribute kKangXi + { list { xsd:string { pattern="[0-9]{4}\.[0-9]{2}[01]" }+ } }? + + code-point-attributes &= attribute kKarlgren + { list { xsd:string { pattern="[1-9][0-9]{0,3}[A*]?" }+ } }? + + code-point-attributes &= attribute kKorean + { list { xsd:string { pattern="[A-Z]+" }+ } }? + + code-point-attributes &= attribute kKoreanEducationHanja + { list { xsd:string { pattern="20[0-9]{2}" }+ } }? + + code-point-attributes &= attribute kKoreanName + { list { xsd:string { pattern="20[0-9]{2}" }+ } }? + + code-point-attributes &= attribute kLau + { list { xsd:string { pattern="[1-9][0-9]{0,3}" }+ } }? + + code-point-attributes &= attribute kMainlandTelegraph + { list { xsd:string { pattern="[0-9]{4}" }+ } }? 
+ + code-point-attributes &= attribute kMandarin + { list { xsd:string { pattern="[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kMatthews + { list { xsd:string { pattern="[1-9][0-9]{0,3}(a|\.5)?" }+ } }? + + code-point-attributes &= attribute kMeyerWempe + { list { xsd:string { pattern="[1-9][0-9]{0,3}[a-t*]?" }+ } }? + + code-point-attributes &= attribute kMojiJoho + { list { xsd:string { pattern="MJ\d{6}(:(FE0[01]|E01[01][0-9A-F]))?" }+ } }? + + code-point-attributes &= attribute kMorohashi + { list { xsd:string { pattern="(\d{5}'{0,2}|H\d{3})(:(FE0[01]|E010[0-9A-F]))?" }+ } }? + + code-point-attributes &= attribute kNelson + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kOtherNumeric + { list { xsd:string { pattern="[0-9]+" }+ } }? + + code-point-attributes &= attribute kPhonetic + { list { xsd:string { pattern="[1-9][0-9]{0,3}[A-D]?\*?" }+ } }? + + code-point-attributes &= attribute kPrimaryNumeric + { list { xsd:string { pattern="[0-9]+" }+ } }? + + code-point-attributes &= attribute kPseudoGB1 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kRSAdobe_Japan1_6 + { list { xsd:string { pattern="[CV]\+[0-9]{1,5}\+[1-9][0-9]{0,2}\.[1-9][0-9]?\.[0-9]{1,2}" }+ } }? + + code-point-attributes &= attribute kRSUnicode + { list { xsd:string { pattern="[1-9][0-9]{0,2}'{0,3}\.-?[0-9]{1,2}" }+ } }? + + code-point-attributes &= attribute kSBGY + { list { xsd:string { pattern="[0-9]{3}\.[0-7][0-9]" }+ } }? + + code-point-attributes &= attribute kSemanticVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}(<[ks][A-Za-z0-9_]+(:[TBZFJ]+)?(,[ks][A-Za-z0-9_]+(:[TBZFJ]+)?)*)?" }+ } }? + + code-point-attributes &= attribute kSimplifiedVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}" }+ } }? + + code-point-attributes &= attribute kSMSZD2003Index + { list { xsd:string { pattern="\d{1,3}\.\d{2}" }+ } }? 
+ + code-point-attributes &= attribute kSMSZD2003Readings + { list { xsd:string { pattern="[a-z\x{300}\x{301}\x{302}\x{304}\x{308}\x{30C}]+(,[a-z\x{300}\x{301}\x{302}\x{304}\x{308}\x{30C}]+)*\x{7CB5}[a-z]+[1-6]([a-z]+[1-6])?(,[a-z]+[1-6]([a-z]+[1-6])?)*" }+ } }? + + code-point-attributes &= attribute kSpecializedSemanticVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}(<[ks][A-Za-z0-9_]+(:[TBZFJ]+)?(,[ks][A-Za-z0-9_]+(:[TBZFJ]+)?)*)?" }+ } }? + + code-point-attributes &= attribute kSpoofingVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}" }+ } }? + + code-point-attributes &= attribute kStrange + { list { ( xsd:string { pattern="[ACU]" } + | xsd:string { pattern="B:U\+31[0-2AB][0-9A-F]" } + | xsd:string { pattern="[FMOR](:U\+[23]?[0-9A-F]{4})?" } + | xsd:string { pattern="H:U\+31[3-8][0-9A-F]" } + | xsd:string { pattern="I(:U\+[23]?[0-9A-F]{4})*" } + | xsd:string { pattern="K(:U\+30[A-F][0-9A-F])+" } + | xsd:string { pattern="S:[4-9][0-9]" } + )+}}? + + code-point-attributes &= attribute kTaiwanTelegraph + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kTang + { list { xsd:string { pattern="\*?[A-Za-z()\x{E6}\x{251}\x{259}\x{25B}\x{300}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kTGH + { list { xsd:string { pattern="20[0-9]{2}:[1-9][0-9]{0,3}" }+ } }? + + code-point-attributes &= attribute kTGHZ2013 + { list { xsd:string { pattern="[0-9]{3}\.[0-9]{3}(,[0-9]{3}\.[0-9]{3})*:[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kTotalStrokes + { list { xsd:string { pattern="[1-9][0-9]{0,2}" }+ } }? + + code-point-attributes &= attribute kTraditionalVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}" }+ } }? + + code-point-attributes &= attribute kUnihanCore2020 + { xsd:string { pattern="[GHJKMPT]{1,7}" } }? 
+ + code-point-attributes &= attribute kVietnamese + { list { xsd:string { pattern="[A-Za-z\x{110}\x{111}\x{300}-\x{303}\x{306}\x{309}\x{31B}\x{323}]+" }+ } }? + + code-point-attributes &= attribute kVietnameseNumeric + { list { xsd:string { pattern="\d+" }+ } }? + + code-point-attributes &= attribute kXerox + { list { xsd:string { pattern="[0-9]{3}:[0-9]{3}" }+ } }? + + code-point-attributes &= attribute kXHC1983 + { list { xsd:string { pattern="[0-9]{4}\.[0-9]{3}\*?(,[0-9]{4}\.[0-9]{3}\*?)*:[a-z\x{300}\x{301}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kZhuang + { list { xsd:string { pattern="[a-z]+\*?" }+ } }? + + code-point-attributes &= attribute kZhuangNumeric + { list { xsd:string { pattern="\d+" }+ } }? + + code-point-attributes &= attribute kZVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}(<[ks][A-Za-z0-9_]+(:[TBZ]+)?(,[ks][A-Za-z0-9_]+(:[TBZ]+)?)*)?" }+ } }? + + + code-point-attributes &= + attribute kRSTUnicode { xsd:string { pattern="[0-9]+\.[0-9]+" } }? + + code-point-attributes &= + attribute kTGT_MergedSrc + { xsd:string {pattern="L2008-[0-9A-F]{4,5}(-[0-9]{4,5})?"} + | xsd:string {pattern="L2006-[0-9]{4}"} + | xsd:string {pattern="L1997-[0-9]{4}"} + | xsd:string {pattern="L1986-[0-9]{4}"} + | xsd:string {pattern="S1968-[0-9]{4}"} + | xsd:string {pattern="N1966-[0-9]{3}(-[0-9A-Z]{3,4})?"} + | xsd:string {pattern="H2004-[A-Z]-[0-9]{4}"} + | xsd:string {pattern="L2012-[0-9]{4}"} + | xsd:string {pattern="UTN42-[0-9]{3}"} + }? + + + code-point-attributes &= + attribute kSrc_NushuDuben { xsd:string { pattern="[0-9]+\.[0-9]+" } }? + + code-point-attributes &= + attribute kReading { xsd:string }? + + + ucd.content &= + element blocks { + element block { + attribute first-cp { single-code-point }, + attribute last-cp { single-code-point }, + attribute name { text } }+ }? 
+ + + ucd.content &= + element named-sequences { + element named-sequence { + attribute cps { one-or-more-code-points }, + attribute name { text } }+ }? + + ucd.content &= + element provisional-named-sequences { + element named-sequence { + attribute cps { one-or-more-code-points }, + attribute name { text } }+ }? + + + ucd.content &= + element normalization-corrections { + element normalization-correction { + attribute cp { single-code-point }, + attribute old { one-or-more-code-points }, + attribute new { one-or-more-code-points }, + attribute version { text } }+ }? + + + ucd.content &= + element standardized-variants { + element standardized-variant { + attribute cps { two-code-points }, + attribute desc { text }, + attribute when { text } }+ }? + + + ucd.content &= + element cjk-radicals { + element cjk-radical { + attribute number { xsd:string {pattern="[0-9]{1,3}'{0,3}"}}, + attribute radical { single-code-point? }, + attribute ideograph { single-code-point } }+ }? + + + ucd.content &= + element emoji-sources { + element emoji-source { + attribute unicode { one-or-more-code-points }, + attribute docomo { jis-code-point? }, + attribute kddi { jis-code-point? }, + attribute softbank { jis-code-point? } }+ }? + + + code-point-attributes &= + attribute Emoji { boolean }? + + code-point-attributes &= + attribute EPres { boolean }? + + code-point-attributes &= + attribute EMod { boolean }? + + code-point-attributes &= + attribute EBase { boolean }? + + code-point-attributes &= + attribute EComp { boolean }? + + code-point-attributes &= + attribute ExtPict { boolean }? 
+ + + ucd.content &= + element do-not-emit { + element instead { + attribute of { one-or-more-code-points }, + attribute use { one-or-more-code-points }, + attribute because { "Bengali_Khanda_Ta" + | "Deprecated" + | "Discouraged" + | "Dotless_Form" + | "Hamza_Form" + | "Indic_Atomic_Consonant" + | "Indic_Consonant_Conjunct" + | "Indic_Vowel_Letter" + | "Malayalam_Chillu" + | "Precomposed_Form" + | "Precomposed_Hieroglyph" + | "Preferred_Spelling" + | "Tamil_Shrii" + } }+ }? + diff --git a/unicodetools/src/main/resources/org/unicode/uax42/pom.xml b/unicodetools/src/main/resources/org/unicode/uax42/pom.xml new file mode 100644 index 0000000000..9ae81d56f9 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/pom.xml @@ -0,0 +1,72 @@ + + + + 4.0.0 + + uax42 + Unicode Standard Annex #42 + + + + org.unicode.unicodetools + unicodetools-parent + 1.0.0 + + + + + + org.codehaus.mojo + xml-maven-plugin + 1.1.0 + + + + transform + + + + + + + ${project.basedir} + true + + index.xml + + index2html.xsl + ${outputdir} + + + .html + + + + + ${project.basedir} + true + + index.xml + + index2rnc.xsl + ${outputdir} + + + .rnc + + + + + + + + net.sf.saxon + Saxon-HE + 12.4 + + + + + + + From a8d3f5b9104bad6c9f31be4adccd990c0216cd20 Mon Sep 17 00:00:00 2001 From: John Wilcock Date: Fri, 7 Feb 2025 14:44:16 -0800 Subject: [PATCH 02/10] Review changes from Markus --- docs/ucdxml.md | 4 +- .../org/unicode/xml/AttributeResolver.java | 51 +- ...{CompareUcdXML.java => CompareUCDXML.java} | 11 +- .../unicode/xml/GeneratePropertyValues.java | 20 +- .../java/org/unicode/xml/UCDDataResolver.java | 17 +- ...ertyDetail.java => UCDPropertyDetail.java} | 1068 +++++++++-------- ...omponent.java => UCDSectionComponent.java} | 7 +- ...ctionDetail.java => UCDSectionDetail.java} | 87 +- .../unicode/xml/{UcdXML.java => UCDXML.java} | 208 ++-- .../java/org/unicode/xml/UCDXMLWriter.java | 9 +- .../java/org/unicode/xml/XMLProperties.java | 16 +- .../resources/org/unicode/uax42/index.xml | 4 
+- 12 files changed, 795 insertions(+), 707 deletions(-) rename unicodetools/src/main/java/org/unicode/xml/{CompareUcdXML.java => CompareUCDXML.java} (96%) rename unicodetools/src/main/java/org/unicode/xml/{UcdPropertyDetail.java => UCDPropertyDetail.java} (70%) rename unicodetools/src/main/java/org/unicode/xml/{UcdSectionComponent.java => UCDSectionComponent.java} (75%) rename unicodetools/src/main/java/org/unicode/xml/{UcdSectionDetail.java => UCDSectionDetail.java} (72%) rename unicodetools/src/main/java/org/unicode/xml/{UcdXML.java => UCDXML.java} (85%) diff --git a/docs/ucdxml.md b/docs/ucdxml.md index 207842db2a..a8d1d1e954 100644 --- a/docs/ucdxml.md +++ b/docs/ucdxml.md @@ -10,8 +10,8 @@ ## Step 3 - Validate generated UAX XML files -You'll need a [RELAX NG](https://relaxng.org/) schema validator. We'll use [jing-trang](https://github. -com/relaxng/jing-trang) in this example. +You'll need a [RELAX NG](https://relaxng.org/) schema validator. +We'll use [jing-trang](https://github.com/relaxng/jing-trang) in this example. 1. Clone and build [jing-trang](https://github.com/relaxng/jing-trang) 2. 
Run the following: diff --git a/unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java b/unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java index 393bb32815..0ef4f17515 100644 --- a/unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java +++ b/unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java @@ -2,9 +2,24 @@ import com.ibm.icu.dev.util.UnicodeMap; import com.ibm.icu.util.VersionInfo; -import java.util.*; import org.unicode.cldr.draft.FileUtilities; -import org.unicode.props.*; +import org.unicode.props.IndexUnicodeProperties; +import org.unicode.props.PropertyParsingInfo; +import org.unicode.props.UcdLineParser; +import org.unicode.props.UcdProperty; +import org.unicode.props.UcdPropertyValues; +import org.unicode.props.UnicodeProperty; + +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Locale; +import java.util.Optional; + +/** + * Used by UCDXML to get string values of attributes for each code point from IndexUnicodeProperties. + */ public class AttributeResolver { @@ -19,14 +34,14 @@ public class AttributeResolver { // If there is a change in any of these properties between two adjacent characters, it will // result in a new range. 
- private final UcdPropertyDetail[] rangeDefiningPropertyDetails = { - UcdPropertyDetail.Age_Detail, - UcdPropertyDetail.Bidi_Class_Detail, - UcdPropertyDetail.Block_Detail, - UcdPropertyDetail.Decomposition_Mapping_Detail, - UcdPropertyDetail.Numeric_Type_Detail, - UcdPropertyDetail.Numeric_Value_Detail, - UcdPropertyDetail.Vertical_Orientation_Detail + private final UCDPropertyDetail[] rangeDefiningPropertyDetails = { + UCDPropertyDetail.Age_Detail, + UCDPropertyDetail.Bidi_Class_Detail, + UCDPropertyDetail.Block_Detail, + UCDPropertyDetail.Decomposition_Mapping_Detail, + UCDPropertyDetail.Numeric_Type_Detail, + UCDPropertyDetail.Numeric_Value_Detail, + UCDPropertyDetail.Vertical_Orientation_Detail }; public AttributeResolver(IndexUnicodeProperties iup) { @@ -93,7 +108,7 @@ public int compare(NameAlias o1, NameAlias o2) { } private HashMap<Integer, LinkedList<NameAlias>> loadNameAliases() { - HashMap<Integer, LinkedList<NameAlias>> nameAliasesByCodepoint = new HashMap<>(); + HashMap<Integer, LinkedList<NameAlias>> nameAliasesByCodePoint = new HashMap<>(); final PropertyParsingInfo fileInfo = PropertyParsingInfo.getPropertyInfo(UcdProperty.Name_Alias); String fullFilename = fileInfo.getFullFileName(indexUnicodeProperties.getUcdVersion()); @@ -112,17 +127,17 @@ private HashMap<Integer, LinkedList<NameAlias>> loadNameAliases() { parts[1], AliasType.valueOf(parts[2].toUpperCase(Locale.ROOT))); } - if (nameAliasesByCodepoint.containsKey(codepoint)) { + if (nameAliasesByCodePoint.containsKey(codepoint)) { LinkedList<NameAlias> nameAliases = - new LinkedList<>(nameAliasesByCodepoint.get(codepoint)); + new LinkedList<>(nameAliasesByCodePoint.get(codepoint)); nameAliases.add(nameAlias); nameAliases.sort(nameAliasComparator); - nameAliasesByCodepoint.replace(codepoint, nameAliases); + nameAliasesByCodePoint.replace(codepoint, nameAliases); } else { - nameAliasesByCodepoint.put(codepoint, new LinkedList<>(List.of(nameAlias))); + nameAliasesByCodePoint.put(codepoint, new LinkedList<>(List.of(nameAlias))); } } - return nameAliasesByCodepoint; + return nameAliasesByCodePoint; } public String
getAttributeValue(UcdProperty prop, int codepoint) { @@ -254,7 +269,7 @@ public String getAttributeValue(UcdProperty prop, int codepoint) { } } - public boolean isUnassignedCodepoint(int codepoint) { + public boolean isUnassignedCodePoint(int codepoint) { return UcdPropertyValues.General_Category_Values.Unassigned.equals(getgc(codepoint)) || UcdPropertyValues.General_Category_Values.Private_Use.equals(getgc(codepoint)) || UcdPropertyValues.General_Category_Values.Surrogate.equals(getgc(codepoint)); @@ -300,7 +315,7 @@ private String getMappingValue( public boolean isDifferentRange(VersionInfo ucdVersion, int codepointA, int codepointB) { boolean isDifference = false; - for (UcdPropertyDetail propDetail : rangeDefiningPropertyDetails) { + for (UCDPropertyDetail propDetail : rangeDefiningPropertyDetails) { UcdProperty prop = propDetail.getUcdProperty(); if (ucdVersion.compareTo(propDetail.getMinVersion()) >= 0 && (propDetail.getMaxVersion() == null diff --git a/unicodetools/src/main/java/org/unicode/xml/CompareUcdXML.java b/unicodetools/src/main/java/org/unicode/xml/CompareUCDXML.java similarity index 96% rename from unicodetools/src/main/java/org/unicode/xml/CompareUcdXML.java rename to unicodetools/src/main/java/org/unicode/xml/CompareUCDXML.java index 52d3421e23..26c280cca3 100644 --- a/unicodetools/src/main/java/org/unicode/xml/CompareUcdXML.java +++ b/unicodetools/src/main/java/org/unicode/xml/CompareUCDXML.java @@ -3,12 +3,19 @@ import com.ibm.icu.dev.tool.UOption; import com.ibm.icu.dev.util.UnicodeMap; import com.ibm.icu.text.UnicodeSet; -import java.io.*; + +import java.io.File; +import java.io.IOException; import java.util.HashMap; import java.util.Objects; import org.unicode.props.UcdProperty; -public class CompareUcdXML { +/** + * Utility for comparing two UCDXML files. + * Originally intended to compare UCDXML files generated using https://github.com/eric-muller/ucdxml to UCDXML files + * generated using org.unicode.xml.UCDXML. 
+ */ +public class CompareUCDXML { private static final String NEWLINE = System.getProperty("line.separator"); private static final UOption[] options = { diff --git a/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java b/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java index f8a0dfa279..03f10a428b 100644 --- a/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java +++ b/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java @@ -2,17 +2,31 @@ import com.ibm.icu.dev.tool.UOption; import com.ibm.icu.util.VersionInfo; -import java.io.*; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStreamWriter; import java.net.URI; import java.net.URISyntaxException; import java.nio.charset.StandardCharsets; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.unicode.props.PropertyParsingInfo; import org.unicode.props.UcdProperty; import org.unicode.props.UcdPropertyValues.*; +/** + * Utility for generating fragments that describe the property values in a format that can be displayed in UAX42. + * UAX42 fragments live in unicodetools/src/main/resources/org/unicode/uax42/fragments + */ public class GeneratePropertyValues { private enum VALUESOUTPUTTYPE { @@ -669,7 +683,7 @@ private static String getFormattedTR38Syntax(UcdProperty ucdProperty) { // TODO: We should determine whether we still want to show empty values in the XML files. 
// TODO: See org.unicode.xml.UcdPropertyDetail.isCJKShowIfEmpty() boolean isShowIfEmpty = false; - for (UcdPropertyDetail propDetail : UcdPropertyDetail.cjkValues()) { + for (UCDPropertyDetail propDetail : UCDPropertyDetail.cjkValues()) { if (propDetail.getUcdProperty().equals(ucdProperty)) { isShowIfEmpty = propDetail.isCJKShowIfEmpty(); } diff --git a/unicodetools/src/main/java/org/unicode/xml/UCDDataResolver.java b/unicodetools/src/main/java/org/unicode/xml/UCDDataResolver.java index a30067bbb6..d30693e838 100644 --- a/unicodetools/src/main/java/org/unicode/xml/UCDDataResolver.java +++ b/unicodetools/src/main/java/org/unicode/xml/UCDDataResolver.java @@ -1,7 +1,6 @@ package org.unicode.xml; import com.ibm.icu.util.VersionInfo; -import java.util.*; import org.unicode.cldr.draft.FileUtilities; import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.PropertyParsingInfo; @@ -9,6 +8,14 @@ import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; + +/** + * Helper class for building sections of UCDXML files based on IndexUnicodeProperties values. 
+ */ public class UCDDataResolver { private final IndexUnicodeProperties indexUnicodeProperties; @@ -21,20 +28,20 @@ public UCDDataResolver(IndexUnicodeProperties iup, String namespace, UCDXMLWrite this.writer = writer; } - public void buildSection(UcdSectionDetail.UcdSection ucdSection) throws SAXException { + public void buildSection(UCDSectionDetail.UcdSection ucdSection) throws SAXException { VersionInfo minVersion = ucdSection.getMinVersion(); VersionInfo maxVersion = ucdSection.getMaxVersion(); String tag = ucdSection.toString(); String childTag = ucdSection.getChildTag(); boolean parserWithRange = ucdSection.getParserWithRange(); boolean parserWithMissing = ucdSection.getParserWithMissing(); - UcdSectionComponent[] ucdSectionComponents = + UCDSectionComponent[] ucdSectionComponents = ucdSection.getUcdSectionDetail().getUcdSectionComponents(); if (isCompatibleVersion(minVersion, maxVersion)) { writer.startElement(tag); { - for (UcdSectionComponent ucdSectionComponent : ucdSectionComponents) { + for (UCDSectionComponent ucdSectionComponent : ucdSectionComponents) { if (isCompatibleVersion( ucdSectionComponent.getMinVersion(), ucdSectionComponent.getMaxVersion())) { @@ -115,7 +122,7 @@ public void buildSection(UcdSectionDetail.UcdSection ucdSection) throws SAXExcep } private AttributesImpl getAttributes( - UcdSectionDetail.UcdSection ucdSection, String namespace, UcdLineParser.UcdLine line) { + UCDSectionDetail.UcdSection ucdSection, String namespace, UcdLineParser.UcdLine line) { switch (ucdSection) { case CJKRADICALS: return getCJKRadicalAttributes(namespace, line); diff --git a/unicodetools/src/main/java/org/unicode/xml/UcdPropertyDetail.java b/unicodetools/src/main/java/org/unicode/xml/UCDPropertyDetail.java similarity index 70% rename from unicodetools/src/main/java/org/unicode/xml/UcdPropertyDetail.java rename to unicodetools/src/main/java/org/unicode/xml/UCDPropertyDetail.java index a97ef5bab9..39192fd36b 100644 --- 
a/unicodetools/src/main/java/org/unicode/xml/UcdPropertyDetail.java +++ b/unicodetools/src/main/java/org/unicode/xml/UCDPropertyDetail.java @@ -5,22 +5,26 @@ import java.util.Set; import org.unicode.props.UcdProperty; -public class UcdPropertyDetail { +/** + * Helper class for determining how and when UCD properties should be shown in UCDXML. Also includes information + * about when a UCD property was added to Unicode. + */ +public class UCDPropertyDetail { - private static LinkedHashSet<UcdPropertyDetail> basePropertyDetails = - new LinkedHashSet<UcdPropertyDetail>(); - private static LinkedHashSet<UcdPropertyDetail> cjkPropertyDetails = - new LinkedHashSet<UcdPropertyDetail>(); - private static LinkedHashSet<UcdPropertyDetail> ucdxmlPropertyDetails = - new LinkedHashSet<UcdPropertyDetail>(); - private static LinkedHashSet<UcdPropertyDetail> allPropertyDetails = - new LinkedHashSet<UcdPropertyDetail>(); + private static LinkedHashSet<UCDPropertyDetail> basePropertyDetails = + new LinkedHashSet<UCDPropertyDetail>(); + private static LinkedHashSet<UCDPropertyDetail> cjkPropertyDetails = + new LinkedHashSet<UCDPropertyDetail>(); + private static LinkedHashSet<UCDPropertyDetail> ucdxmlPropertyDetails = + new LinkedHashSet<UCDPropertyDetail>(); + private static LinkedHashSet<UCDPropertyDetail> allPropertyDetails = + new LinkedHashSet<UCDPropertyDetail>(); - public static UcdPropertyDetail Age_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Age_Detail = + new UCDPropertyDetail( UcdProperty.Age, VersionInfo.getInstance(3, 2, 0), 1, true, false, false, true); - public static UcdPropertyDetail Name_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Name_Detail = + new UCDPropertyDetail( UcdProperty.Name, VersionInfo.getInstance(1, 1, 0), 2, @@ -28,8 +32,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Jamo_Short_Name_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Jamo_Short_Name_Detail = + new UCDPropertyDetail( UcdProperty.Jamo_Short_Name, VersionInfo.getInstance(5, 1, 0), 3, @@ -37,8 +41,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail General_Category_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail
General_Category_Detail = + new UCDPropertyDetail( UcdProperty.General_Category, VersionInfo.getInstance(1, 1, 0), 4, @@ -46,8 +50,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Canonical_Combining_Class_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Canonical_Combining_Class_Detail = + new UCDPropertyDetail( UcdProperty.Canonical_Combining_Class, VersionInfo.getInstance(1, 1, 0), 5, @@ -55,8 +59,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Decomposition_Type_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Decomposition_Type_Detail = + new UCDPropertyDetail( UcdProperty.Decomposition_Type, VersionInfo.getInstance(1, 1, 0), 6, @@ -64,8 +68,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Decomposition_Mapping_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Decomposition_Mapping_Detail = + new UCDPropertyDetail( UcdProperty.Decomposition_Mapping, VersionInfo.getInstance(1, 1, 0), 7, @@ -73,8 +77,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Numeric_Type_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Numeric_Type_Detail = + new UCDPropertyDetail( UcdProperty.Numeric_Type, VersionInfo.getInstance(1, 1, 0), 8, @@ -82,8 +86,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Numeric_Value_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Numeric_Value_Detail = + new UCDPropertyDetail( UcdProperty.Numeric_Value, VersionInfo.getInstance(1, 1, 0), 9, @@ -91,8 +95,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Bidi_Class_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Bidi_Class_Detail = + new UCDPropertyDetail( UcdProperty.Bidi_Class, VersionInfo.getInstance(1, 1, 0), 10, @@ -100,8 +104,8 
@@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Bidi_Paired_Bracket_Type_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Bidi_Paired_Bracket_Type_Detail = + new UCDPropertyDetail( UcdProperty.Bidi_Paired_Bracket_Type, VersionInfo.getInstance(6, 3, 0), 11, @@ -109,8 +113,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Bidi_Paired_Bracket_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Bidi_Paired_Bracket_Detail = + new UCDPropertyDetail( UcdProperty.Bidi_Paired_Bracket, VersionInfo.getInstance(6, 3, 0), 12, @@ -118,8 +122,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Bidi_Mirrored_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Bidi_Mirrored_Detail = + new UCDPropertyDetail( UcdProperty.Bidi_Mirrored, VersionInfo.getInstance(1, 1, 0), 13, @@ -127,8 +131,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Bidi_Mirroring_Glyph_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Bidi_Mirroring_Glyph_Detail = + new UCDPropertyDetail( UcdProperty.Bidi_Mirroring_Glyph, VersionInfo.getInstance(3, 0, 1), 14, @@ -136,8 +140,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Simple_Uppercase_Mapping_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Simple_Uppercase_Mapping_Detail = + new UCDPropertyDetail( UcdProperty.Simple_Uppercase_Mapping, VersionInfo.getInstance(1, 1, 0), 15, @@ -145,8 +149,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Simple_Lowercase_Mapping_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Simple_Lowercase_Mapping_Detail = + new UCDPropertyDetail( UcdProperty.Simple_Lowercase_Mapping, VersionInfo.getInstance(1, 1, 0), 16, @@ -154,8 +158,8 @@ public class UcdPropertyDetail { false, 
false, true); - public static UcdPropertyDetail Simple_Titlecase_Mapping_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Simple_Titlecase_Mapping_Detail = + new UCDPropertyDetail( UcdProperty.Simple_Titlecase_Mapping, VersionInfo.getInstance(1, 1, 0), 17, @@ -163,8 +167,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Uppercase_Mapping_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Uppercase_Mapping_Detail = + new UCDPropertyDetail( UcdProperty.Uppercase_Mapping, VersionInfo.getInstance(2, 1, 8), 18, @@ -172,8 +176,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Lowercase_Mapping_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Lowercase_Mapping_Detail = + new UCDPropertyDetail( UcdProperty.Lowercase_Mapping, VersionInfo.getInstance(2, 1, 8), 19, @@ -181,8 +185,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Titlecase_Mapping_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Titlecase_Mapping_Detail = + new UCDPropertyDetail( UcdProperty.Titlecase_Mapping, VersionInfo.getInstance(2, 1, 8), 20, @@ -194,8 +198,8 @@ public class UcdPropertyDetail { // ( // UcdProperty.Special_Case_Condition, VersionInfo.getInstance(1,1,0), 21, // true, false, false, true); - public static UcdPropertyDetail Simple_Case_Folding_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Simple_Case_Folding_Detail = + new UCDPropertyDetail( UcdProperty.Simple_Case_Folding, VersionInfo.getInstance(3, 0, 1), 22, @@ -203,8 +207,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Case_Folding_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Case_Folding_Detail = + new UCDPropertyDetail( UcdProperty.Case_Folding, VersionInfo.getInstance(3, 0, 1), 23, @@ -212,8 +216,8 @@ public class UcdPropertyDetail { false, false, 
true); - public static UcdPropertyDetail Joining_Type_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Joining_Type_Detail = + new UCDPropertyDetail( UcdProperty.Joining_Type, VersionInfo.getInstance(2, 0, 0), 24, @@ -221,8 +225,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Joining_Group_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Joining_Group_Detail = + new UCDPropertyDetail( UcdProperty.Joining_Group, VersionInfo.getInstance(2, 0, 0), 25, @@ -230,8 +234,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail East_Asian_Width_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail East_Asian_Width_Detail = + new UCDPropertyDetail( UcdProperty.East_Asian_Width, VersionInfo.getInstance(3, 0, 0), 26, @@ -239,8 +243,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Line_Break_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Line_Break_Detail = + new UCDPropertyDetail( UcdProperty.Line_Break, VersionInfo.getInstance(3, 0, 0), 27, @@ -248,8 +252,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Script_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Script_Detail = + new UCDPropertyDetail( UcdProperty.Script, VersionInfo.getInstance(3, 1, 0), 28, @@ -257,8 +261,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Script_Extensions_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Script_Extensions_Detail = + new UCDPropertyDetail( UcdProperty.Script_Extensions, VersionInfo.getInstance(6, 1, 0), 29, @@ -266,8 +270,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Dash_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Dash_Detail = + new UCDPropertyDetail( UcdProperty.Dash, 
VersionInfo.getInstance(2, 0, 0), 30, @@ -275,8 +279,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail White_Space_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail White_Space_Detail = + new UCDPropertyDetail( UcdProperty.White_Space, VersionInfo.getInstance(2, 0, 0), 31, @@ -284,8 +288,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Hyphen_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Hyphen_Detail = + new UCDPropertyDetail( UcdProperty.Hyphen, VersionInfo.getInstance(2, 0, 0), 32, @@ -293,8 +297,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Quotation_Mark_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Quotation_Mark_Detail = + new UCDPropertyDetail( UcdProperty.Quotation_Mark, VersionInfo.getInstance(2, 0, 0), 33, @@ -302,8 +306,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Radical_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Radical_Detail = + new UCDPropertyDetail( UcdProperty.Radical, VersionInfo.getInstance(3, 2, 0), 34, @@ -311,8 +315,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Ideographic_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Ideographic_Detail = + new UCDPropertyDetail( UcdProperty.Ideographic, VersionInfo.getInstance(2, 0, 0), 35, @@ -320,8 +324,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Unified_Ideograph_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Unified_Ideograph_Detail = + new UCDPropertyDetail( UcdProperty.Unified_Ideograph, VersionInfo.getInstance(3, 2, 0), 36, @@ -329,8 +333,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail IDS_Binary_Operator_Detail = - new UcdPropertyDetail( + public static 
UCDPropertyDetail IDS_Binary_Operator_Detail = + new UCDPropertyDetail( UcdProperty.IDS_Binary_Operator, VersionInfo.getInstance(3, 2, 0), 37, @@ -338,8 +342,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail IDS_Trinary_Operator_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail IDS_Trinary_Operator_Detail = + new UCDPropertyDetail( UcdProperty.IDS_Trinary_Operator, VersionInfo.getInstance(3, 2, 0), 38, @@ -347,8 +351,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Hangul_Syllable_Type_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Hangul_Syllable_Type_Detail = + new UCDPropertyDetail( UcdProperty.Hangul_Syllable_Type, VersionInfo.getInstance(4, 0, 0), 39, @@ -356,8 +360,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Default_Ignorable_Code_Point_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Default_Ignorable_Code_Point_Detail = + new UCDPropertyDetail( UcdProperty.Default_Ignorable_Code_Point, VersionInfo.getInstance(3, 2, 0), 40, @@ -365,8 +369,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Other_Default_Ignorable_Code_Point_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Other_Default_Ignorable_Code_Point_Detail = + new UCDPropertyDetail( UcdProperty.Other_Default_Ignorable_Code_Point, VersionInfo.getInstance(3, 2, 0), 41, @@ -374,8 +378,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Alphabetic_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Alphabetic_Detail = + new UCDPropertyDetail( UcdProperty.Alphabetic, VersionInfo.getInstance(1, 1, 0), 42, @@ -383,8 +387,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Other_Alphabetic_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail 
Other_Alphabetic_Detail = + new UCDPropertyDetail( UcdProperty.Other_Alphabetic, VersionInfo.getInstance(3, 1, 0), 43, @@ -392,8 +396,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Uppercase_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Uppercase_Detail = + new UCDPropertyDetail( UcdProperty.Uppercase, VersionInfo.getInstance(3, 1, 0), 44, @@ -401,8 +405,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Other_Uppercase_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Other_Uppercase_Detail = + new UCDPropertyDetail( UcdProperty.Other_Uppercase, VersionInfo.getInstance(3, 1, 0), 45, @@ -410,8 +414,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Lowercase_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Lowercase_Detail = + new UCDPropertyDetail( UcdProperty.Lowercase, VersionInfo.getInstance(3, 1, 0), 46, @@ -419,8 +423,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Other_Lowercase_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Other_Lowercase_Detail = + new UCDPropertyDetail( UcdProperty.Other_Lowercase, VersionInfo.getInstance(3, 1, 0), 47, @@ -428,8 +432,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Math_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Math_Detail = + new UCDPropertyDetail( UcdProperty.Math, VersionInfo.getInstance(2, 0, 0), 48, @@ -437,8 +441,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Other_Math_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Other_Math_Detail = + new UCDPropertyDetail( UcdProperty.Other_Math, VersionInfo.getInstance(3, 1, 0), 49, @@ -446,8 +450,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail 
Hex_Digit_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Hex_Digit_Detail = + new UCDPropertyDetail( UcdProperty.Hex_Digit, VersionInfo.getInstance(2, 0, 0), 50, @@ -455,8 +459,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail ASCII_Hex_Digit_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail ASCII_Hex_Digit_Detail = + new UCDPropertyDetail( UcdProperty.ASCII_Hex_Digit, VersionInfo.getInstance(3, 1, 1), 51, @@ -464,8 +468,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Noncharacter_Code_Point_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Noncharacter_Code_Point_Detail = + new UCDPropertyDetail( UcdProperty.Noncharacter_Code_Point, VersionInfo.getInstance(3, 0, 1), 52, @@ -473,8 +477,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Variation_Selector_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Variation_Selector_Detail = + new UCDPropertyDetail( UcdProperty.Variation_Selector, VersionInfo.getInstance(4, 0, 1), 53, @@ -482,8 +486,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Bidi_Control_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Bidi_Control_Detail = + new UCDPropertyDetail( UcdProperty.Bidi_Control, VersionInfo.getInstance(2, 0, 0), 54, @@ -491,8 +495,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Join_Control_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Join_Control_Detail = + new UCDPropertyDetail( UcdProperty.Join_Control, VersionInfo.getInstance(2, 0, 0), 55, @@ -500,8 +504,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Grapheme_Base_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Grapheme_Base_Detail = + new UCDPropertyDetail( 
UcdProperty.Grapheme_Base, VersionInfo.getInstance(3, 2, 0), 56, @@ -509,8 +513,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Grapheme_Extend_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Grapheme_Extend_Detail = + new UCDPropertyDetail( UcdProperty.Grapheme_Extend, VersionInfo.getInstance(3, 2, 0), 57, @@ -518,8 +522,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Other_Grapheme_Extend_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Other_Grapheme_Extend_Detail = + new UCDPropertyDetail( UcdProperty.Other_Grapheme_Extend, VersionInfo.getInstance(3, 2, 0), 58, @@ -527,8 +531,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Grapheme_Link_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Grapheme_Link_Detail = + new UCDPropertyDetail( UcdProperty.Grapheme_Link, VersionInfo.getInstance(3, 2, 0), 59, @@ -536,8 +540,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Sentence_Terminal_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Sentence_Terminal_Detail = + new UCDPropertyDetail( UcdProperty.Sentence_Terminal, VersionInfo.getInstance(9, 0, 0), 60, @@ -545,8 +549,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Extender_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Extender_Detail = + new UCDPropertyDetail( UcdProperty.Extender, VersionInfo.getInstance(2, 0, 0), 61, @@ -554,8 +558,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Terminal_Punctuation_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Terminal_Punctuation_Detail = + new UCDPropertyDetail( UcdProperty.Terminal_Punctuation, VersionInfo.getInstance(2, 0, 0), 62, @@ -563,8 +567,8 @@ public class UcdPropertyDetail { false, false, 
true); - public static UcdPropertyDetail Diacritic_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Diacritic_Detail = + new UCDPropertyDetail( UcdProperty.Diacritic, VersionInfo.getInstance(2, 0, 0), 63, @@ -572,8 +576,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Deprecated_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Deprecated_Detail = + new UCDPropertyDetail( UcdProperty.Deprecated, VersionInfo.getInstance(3, 2, 0), 64, @@ -581,8 +585,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail ID_Start_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail ID_Start_Detail = + new UCDPropertyDetail( UcdProperty.ID_Start, VersionInfo.getInstance(3, 1, 0), 65, @@ -590,8 +594,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Other_ID_Start_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Other_ID_Start_Detail = + new UCDPropertyDetail( UcdProperty.Other_ID_Start, VersionInfo.getInstance(4, 0, 0), 66, @@ -599,8 +603,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail XID_Start_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail XID_Start_Detail = + new UCDPropertyDetail( UcdProperty.XID_Start, VersionInfo.getInstance(3, 1, 0), 67, @@ -608,8 +612,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail ID_Continue_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail ID_Continue_Detail = + new UCDPropertyDetail( UcdProperty.ID_Continue, VersionInfo.getInstance(3, 1, 0), 68, @@ -617,8 +621,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Other_ID_Continue_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Other_ID_Continue_Detail = + new UCDPropertyDetail( UcdProperty.Other_ID_Continue, 
VersionInfo.getInstance(4, 1, 0), 69, @@ -626,8 +630,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail XID_Continue_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail XID_Continue_Detail = + new UCDPropertyDetail( UcdProperty.XID_Continue, VersionInfo.getInstance(3, 1, 0), 70, @@ -635,8 +639,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Soft_Dotted_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Soft_Dotted_Detail = + new UCDPropertyDetail( UcdProperty.Soft_Dotted, VersionInfo.getInstance(3, 2, 0), 71, @@ -644,8 +648,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Logical_Order_Exception_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Logical_Order_Exception_Detail = + new UCDPropertyDetail( UcdProperty.Logical_Order_Exception, VersionInfo.getInstance(3, 2, 0), 72, @@ -653,8 +657,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Pattern_White_Space_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Pattern_White_Space_Detail = + new UCDPropertyDetail( UcdProperty.Pattern_White_Space, VersionInfo.getInstance(4, 1, 0), 73, @@ -662,8 +666,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Pattern_Syntax_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Pattern_Syntax_Detail = + new UCDPropertyDetail( UcdProperty.Pattern_Syntax, VersionInfo.getInstance(4, 1, 0), 74, @@ -671,8 +675,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Grapheme_Cluster_Break_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Grapheme_Cluster_Break_Detail = + new UCDPropertyDetail( UcdProperty.Grapheme_Cluster_Break, VersionInfo.getInstance(4, 1, 0), 75, @@ -680,8 +684,8 @@ public class UcdPropertyDetail { false, false, true); 
- public static UcdPropertyDetail Word_Break_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Word_Break_Detail = + new UCDPropertyDetail( UcdProperty.Word_Break, VersionInfo.getInstance(4, 1, 0), 76, @@ -689,8 +693,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Sentence_Break_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Sentence_Break_Detail = + new UCDPropertyDetail( UcdProperty.Sentence_Break, VersionInfo.getInstance(4, 1, 0), 77, @@ -698,8 +702,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Composition_Exclusion_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Composition_Exclusion_Detail = + new UCDPropertyDetail( UcdProperty.Composition_Exclusion, VersionInfo.getInstance(3, 0, 0), 78, @@ -707,8 +711,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Full_Composition_Exclusion_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Full_Composition_Exclusion_Detail = + new UCDPropertyDetail( UcdProperty.Full_Composition_Exclusion, VersionInfo.getInstance(3, 1, 0), 79, @@ -716,8 +720,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail NFC_Quick_Check_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail NFC_Quick_Check_Detail = + new UCDPropertyDetail( UcdProperty.NFC_Quick_Check, VersionInfo.getInstance(3, 2, 0), 80, @@ -725,8 +729,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail NFD_Quick_Check_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail NFD_Quick_Check_Detail = + new UCDPropertyDetail( UcdProperty.NFD_Quick_Check, VersionInfo.getInstance(3, 2, 0), 81, @@ -734,8 +738,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail NFKC_Quick_Check_Detail = - new UcdPropertyDetail( + public static 
UCDPropertyDetail NFKC_Quick_Check_Detail = + new UCDPropertyDetail( UcdProperty.NFKC_Quick_Check, VersionInfo.getInstance(5, 2, 0), 82, @@ -743,8 +747,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail NFKD_Quick_Check_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail NFKD_Quick_Check_Detail = + new UCDPropertyDetail( UcdProperty.NFKD_Quick_Check, VersionInfo.getInstance(3, 2, 0), 83, @@ -752,8 +756,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Expands_On_NFC_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Expands_On_NFC_Detail = + new UCDPropertyDetail( UcdProperty.Expands_On_NFC, VersionInfo.getInstance(3, 2, 0), 84, @@ -761,8 +765,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Expands_On_NFD_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Expands_On_NFD_Detail = + new UCDPropertyDetail( UcdProperty.Expands_On_NFD, VersionInfo.getInstance(3, 2, 0), 85, @@ -770,8 +774,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Expands_On_NFKC_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Expands_On_NFKC_Detail = + new UCDPropertyDetail( UcdProperty.Expands_On_NFKC, VersionInfo.getInstance(3, 2, 0), 86, @@ -779,8 +783,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Expands_On_NFKD_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Expands_On_NFKD_Detail = + new UCDPropertyDetail( UcdProperty.Expands_On_NFKD, VersionInfo.getInstance(3, 2, 0), 87, @@ -788,8 +792,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail FC_NFC_Closure_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail FC_NFC_Closure_Detail = + new UCDPropertyDetail( UcdProperty.FC_NFKC_Closure, VersionInfo.getInstance(3, 1, 0), 88, @@ -797,8 
+801,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Case_Ignorable_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Case_Ignorable_Detail = + new UCDPropertyDetail( UcdProperty.Case_Ignorable, VersionInfo.getInstance(5, 2, 0), 89, @@ -806,8 +810,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Cased_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Cased_Detail = + new UCDPropertyDetail( UcdProperty.Cased, VersionInfo.getInstance(5, 2, 0), 90, @@ -815,8 +819,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Changes_When_CaseFolded_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Changes_When_CaseFolded_Detail = + new UCDPropertyDetail( UcdProperty.Changes_When_Casefolded, VersionInfo.getInstance(5, 2, 0), 91, @@ -824,8 +828,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Changes_When_CaseMapped_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Changes_When_CaseMapped_Detail = + new UCDPropertyDetail( UcdProperty.Changes_When_Casemapped, VersionInfo.getInstance(5, 2, 0), 92, @@ -833,8 +837,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Changes_When_NFKC_Casefolded_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Changes_When_NFKC_Casefolded_Detail = + new UCDPropertyDetail( UcdProperty.Changes_When_NFKC_Casefolded, VersionInfo.getInstance(5, 2, 0), 93, @@ -842,8 +846,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Changes_When_Lowercased_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Changes_When_Lowercased_Detail = + new UCDPropertyDetail( UcdProperty.Changes_When_Lowercased, VersionInfo.getInstance(5, 2, 0), 94, @@ -851,8 +855,8 @@ public class UcdPropertyDetail { false, false, true); - 
public static UcdPropertyDetail Changes_When_Titlecased_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Changes_When_Titlecased_Detail = + new UCDPropertyDetail( UcdProperty.Changes_When_Titlecased, VersionInfo.getInstance(5, 2, 0), 95, @@ -860,8 +864,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Changes_When_Uppercased_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Changes_When_Uppercased_Detail = + new UCDPropertyDetail( UcdProperty.Changes_When_Uppercased, VersionInfo.getInstance(5, 2, 0), 96, @@ -869,8 +873,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail NFKC_Casefold_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail NFKC_Casefold_Detail = + new UCDPropertyDetail( UcdProperty.NFKC_Casefold, VersionInfo.getInstance(5, 2, 0), 97, @@ -878,8 +882,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Indic_Syllabic_Category_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Indic_Syllabic_Category_Detail = + new UCDPropertyDetail( UcdProperty.Indic_Syllabic_Category, VersionInfo.getInstance(6, 1, 0), 98, @@ -891,8 +895,8 @@ public class UcdPropertyDetail { // UcdProperty.Indic_Matra_Category, VersionInfo.getInstance(6,1,0), // VersionInfo.getInstance(7,0,0), 99, // true, false, false, true); - public static UcdPropertyDetail Indic_Positional_Category_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Indic_Positional_Category_Detail = + new UCDPropertyDetail( UcdProperty.Indic_Positional_Category, VersionInfo.getInstance(8, 0, 0), 100, @@ -900,8 +904,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail kJa_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kJa_Detail = + new UCDPropertyDetail( UcdProperty.kJa, VersionInfo.getInstance(8, 0, 0), 101, @@ -909,8 +913,8 @@ public class 
UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail Prepended_Concatenation_Mark_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Prepended_Concatenation_Mark_Detail = + new UCDPropertyDetail( UcdProperty.Prepended_Concatenation_Mark, VersionInfo.getInstance(9, 0, 0), 102, @@ -918,8 +922,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Vertical_Orientation_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Vertical_Orientation_Detail = + new UCDPropertyDetail( UcdProperty.Vertical_Orientation, VersionInfo.getInstance(10, 0, 0), 103, @@ -927,8 +931,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Regional_Indicator_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Regional_Indicator_Detail = + new UCDPropertyDetail( UcdProperty.Regional_Indicator, VersionInfo.getInstance(10, 0, 0), 104, @@ -936,8 +940,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Block_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Block_Detail = + new UCDPropertyDetail( UcdProperty.Block, VersionInfo.getInstance(2, 0, 0), 105, @@ -945,8 +949,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Equivalent_Unified_Ideograph_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Equivalent_Unified_Ideograph_Detail = + new UCDPropertyDetail( UcdProperty.Equivalent_Unified_Ideograph, VersionInfo.getInstance(11, 0, 0), 106, @@ -954,8 +958,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kCompatibilityVariant_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kCompatibilityVariant_Detail = + new UCDPropertyDetail( UcdProperty.kCompatibilityVariant, VersionInfo.getInstance(3, 2, 0), 107, @@ -963,8 +967,8 @@ public class UcdPropertyDetail { true, true, true); - public 
static UcdPropertyDetail kRSUnicode_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kRSUnicode_Detail = + new UCDPropertyDetail( UcdProperty.kRSUnicode, VersionInfo.getInstance(2, 0, 0), 108, @@ -975,8 +979,8 @@ public class UcdPropertyDetail { // public static UcdPropertyDetail kIRG_RSIndex_Detail = new UcdPropertyDetail ( // UcdProperty.kIRG_RSIndex, VersionInfo.getInstance(11,0,0), 109, // false, true, false, true); - public static UcdPropertyDetail kIRG_GSource_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRG_GSource_Detail = + new UCDPropertyDetail( UcdProperty.kIRG_GSource, VersionInfo.getInstance(3, 0, 0), 110, @@ -984,8 +988,8 @@ public class UcdPropertyDetail { true, true, true); - public static UcdPropertyDetail kIRG_TSource_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRG_TSource_Detail = + new UCDPropertyDetail( UcdProperty.kIRG_TSource, VersionInfo.getInstance(3, 0, 0), 111, @@ -993,8 +997,8 @@ public class UcdPropertyDetail { true, true, true); - public static UcdPropertyDetail kIRG_JSource_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRG_JSource_Detail = + new UCDPropertyDetail( UcdProperty.kIRG_JSource, VersionInfo.getInstance(3, 0, 0), 112, @@ -1002,8 +1006,8 @@ public class UcdPropertyDetail { true, true, true); - public static UcdPropertyDetail kIRG_KSource_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRG_KSource_Detail = + new UCDPropertyDetail( UcdProperty.kIRG_KSource, VersionInfo.getInstance(3, 0, 0), 113, @@ -1011,8 +1015,8 @@ public class UcdPropertyDetail { true, true, true); - public static UcdPropertyDetail kIRG_KPSource_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRG_KPSource_Detail = + new UCDPropertyDetail( UcdProperty.kIRG_KPSource, VersionInfo.getInstance(3, 1, 1), 114, @@ -1020,8 +1024,8 @@ public class UcdPropertyDetail { true, true, true); - public static UcdPropertyDetail 
kIRG_VSource_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRG_VSource_Detail = + new UCDPropertyDetail( UcdProperty.kIRG_VSource, VersionInfo.getInstance(3, 0, 0), 115, @@ -1029,8 +1033,8 @@ public class UcdPropertyDetail { true, true, true); - public static UcdPropertyDetail kIRG_HSource_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRG_HSource_Detail = + new UCDPropertyDetail( UcdProperty.kIRG_HSource, VersionInfo.getInstance(3, 1, 0), 116, @@ -1038,8 +1042,8 @@ public class UcdPropertyDetail { true, true, true); - public static UcdPropertyDetail kIRG_USource_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRG_USource_Detail = + new UCDPropertyDetail( UcdProperty.kIRG_USource, VersionInfo.getInstance(4, 0, 1), 117, @@ -1047,8 +1051,8 @@ public class UcdPropertyDetail { true, true, true); - public static UcdPropertyDetail kIRG_MSource_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRG_MSource_Detail = + new UCDPropertyDetail( UcdProperty.kIRG_MSource, VersionInfo.getInstance(5, 2, 0), 118, @@ -1056,8 +1060,8 @@ public class UcdPropertyDetail { true, true, true); - public static UcdPropertyDetail kIRG_UKSource_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRG_UKSource_Detail = + new UCDPropertyDetail( UcdProperty.kIRG_UKSource, VersionInfo.getInstance(13, 0, 0), 119, @@ -1065,8 +1069,8 @@ public class UcdPropertyDetail { true, true, true); - public static UcdPropertyDetail kIRG_SSource_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRG_SSource_Detail = + new UCDPropertyDetail( UcdProperty.kIRG_SSource, VersionInfo.getInstance(13, 0, 0), 120, @@ -1074,8 +1078,8 @@ public class UcdPropertyDetail { true, true, true); - public static UcdPropertyDetail kIICore_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIICore_Detail = + new UCDPropertyDetail( UcdProperty.kIICore, VersionInfo.getInstance(4, 1, 0), 121, @@ -1083,8 
+1087,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kUnihanCore2020_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kUnihanCore2020_Detail = + new UCDPropertyDetail( UcdProperty.kUnihanCore2020, VersionInfo.getInstance(13, 0, 0), 122, @@ -1092,8 +1096,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kGB0_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kGB0_Detail = + new UCDPropertyDetail( UcdProperty.kGB0, VersionInfo.getInstance(2, 0, 0), 123, @@ -1101,8 +1105,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kGB1_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kGB1_Detail = + new UCDPropertyDetail( UcdProperty.kGB1, VersionInfo.getInstance(2, 0, 0), 124, @@ -1110,8 +1114,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kGB3_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kGB3_Detail = + new UCDPropertyDetail( UcdProperty.kGB3, VersionInfo.getInstance(2, 0, 0), 125, @@ -1119,8 +1123,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kGB5_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kGB5_Detail = + new UCDPropertyDetail( UcdProperty.kGB5, VersionInfo.getInstance(2, 0, 0), 126, @@ -1128,8 +1132,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kGB7_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kGB7_Detail = + new UCDPropertyDetail( UcdProperty.kGB7, VersionInfo.getInstance(2, 0, 0), 127, @@ -1137,8 +1141,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kGB8_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kGB8_Detail = + new UCDPropertyDetail( UcdProperty.kGB8, VersionInfo.getInstance(2, 0, 0), 128, @@ -1146,8 +1150,8 @@ public 
class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kCNS1986_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kCNS1986_Detail = + new UCDPropertyDetail( UcdProperty.kCNS1986, VersionInfo.getInstance(2, 0, 0), 129, @@ -1155,8 +1159,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kCNS1992_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kCNS1992_Detail = + new UCDPropertyDetail( UcdProperty.kCNS1992, VersionInfo.getInstance(2, 0, 0), 130, @@ -1164,8 +1168,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kJis0_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kJis0_Detail = + new UCDPropertyDetail( UcdProperty.kJis0, VersionInfo.getInstance(2, 0, 0), 131, @@ -1173,8 +1177,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kJis1_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kJis1_Detail = + new UCDPropertyDetail( UcdProperty.kJis1, VersionInfo.getInstance(2, 0, 0), 132, @@ -1182,8 +1186,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kJIS0213_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kJIS0213_Detail = + new UCDPropertyDetail( UcdProperty.kJIS0213, VersionInfo.getInstance(3, 1, 1), 133, @@ -1191,8 +1195,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kKSC0_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kKSC0_Detail = + new UCDPropertyDetail( UcdProperty.kKSC0, VersionInfo.getInstance(2, 0, 0), VersionInfo.getInstance(15, 1, 0), @@ -1201,8 +1205,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kKSC1_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kKSC1_Detail = + new UCDPropertyDetail( UcdProperty.kKSC1, VersionInfo.getInstance(2, 0, 0), 
VersionInfo.getInstance(15, 1, 0), @@ -1211,8 +1215,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kKPS0_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kKPS0_Detail = + new UCDPropertyDetail( UcdProperty.kKPS0, VersionInfo.getInstance(3, 1, 1), VersionInfo.getInstance(15, 1, 0), @@ -1221,8 +1225,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kKPS1_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kKPS1_Detail = + new UCDPropertyDetail( UcdProperty.kKPS1, VersionInfo.getInstance(3, 1, 1), VersionInfo.getInstance(15, 1, 0), @@ -1231,8 +1235,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kHKSCS_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kHKSCS_Detail = + new UCDPropertyDetail( UcdProperty.kHKSCS, VersionInfo.getInstance(3, 1, 1), VersionInfo.getInstance(15, 1, 0), @@ -1241,8 +1245,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kCantonese_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kCantonese_Detail = + new UCDPropertyDetail( UcdProperty.kCantonese, VersionInfo.getInstance(2, 0, 0), 139, @@ -1250,8 +1254,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kHangul_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kHangul_Detail = + new UCDPropertyDetail( UcdProperty.kHangul, VersionInfo.getInstance(5, 0, 0), 140, @@ -1259,8 +1263,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kDefinition_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kDefinition_Detail = + new UCDPropertyDetail( UcdProperty.kDefinition, VersionInfo.getInstance(2, 0, 0), 141, @@ -1268,8 +1272,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kHanYu_Detail = - new 
UcdPropertyDetail( + public static UCDPropertyDetail kHanYu_Detail = + new UCDPropertyDetail( UcdProperty.kHanYu, VersionInfo.getInstance(2, 0, 0), 142, @@ -1281,8 +1285,8 @@ public class UcdPropertyDetail { // UcdProperty.kAlternateHanYu, VersionInfo.getInstance(2,0,0), // VersionInfo.getInstance(3,1,1), 143, // false, true, false, true); - public static UcdPropertyDetail kMandarin_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kMandarin_Detail = + new UCDPropertyDetail( UcdProperty.kMandarin, VersionInfo.getInstance(2, 0, 0), 144, @@ -1290,8 +1294,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kCihaiT_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kCihaiT_Detail = + new UCDPropertyDetail( UcdProperty.kCihaiT, VersionInfo.getInstance(3, 2, 0), 145, @@ -1299,8 +1303,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kSBGY_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kSBGY_Detail = + new UCDPropertyDetail( UcdProperty.kSBGY, VersionInfo.getInstance(3, 2, 0), 146, @@ -1308,8 +1312,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kNelson_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kNelson_Detail = + new UCDPropertyDetail( UcdProperty.kNelson, VersionInfo.getInstance(2, 0, 0), 147, @@ -1317,8 +1321,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kCowles_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kCowles_Detail = + new UCDPropertyDetail( UcdProperty.kCowles, VersionInfo.getInstance(3, 1, 1), 148, @@ -1326,8 +1330,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kMatthews_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kMatthews_Detail = + new UCDPropertyDetail( UcdProperty.kMatthews, VersionInfo.getInstance(2, 0, 0), 149, @@ 
-1335,8 +1339,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kOtherNumeric_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kOtherNumeric_Detail = + new UCDPropertyDetail( UcdProperty.kOtherNumeric, VersionInfo.getInstance(3, 2, 0), 150, @@ -1344,8 +1348,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kPhonetic_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kPhonetic_Detail = + new UCDPropertyDetail( UcdProperty.kPhonetic, VersionInfo.getInstance(3, 1, 0), 151, @@ -1353,8 +1357,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kGSR_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kGSR_Detail = + new UCDPropertyDetail( UcdProperty.kGSR, VersionInfo.getInstance(4, 0, 1), 152, @@ -1362,8 +1366,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kFenn_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kFenn_Detail = + new UCDPropertyDetail( UcdProperty.kFenn, VersionInfo.getInstance(3, 1, 1), 153, @@ -1371,8 +1375,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kFennIndex_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kFennIndex_Detail = + new UCDPropertyDetail( UcdProperty.kFennIndex, VersionInfo.getInstance(4, 1, 0), 154, @@ -1380,8 +1384,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kKarlgren_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kKarlgren_Detail = + new UCDPropertyDetail( UcdProperty.kKarlgren, VersionInfo.getInstance(3, 1, 1), 155, @@ -1389,8 +1393,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kCangjie_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kCangjie_Detail = + new UCDPropertyDetail( UcdProperty.kCangjie, 
VersionInfo.getInstance(3, 1, 1), 156, @@ -1398,8 +1402,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kMeyerWempe_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kMeyerWempe_Detail = + new UCDPropertyDetail( UcdProperty.kMeyerWempe, VersionInfo.getInstance(3, 1, 0), 157, @@ -1407,8 +1411,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kSimplifiedVariant_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kSimplifiedVariant_Detail = + new UCDPropertyDetail( UcdProperty.kSimplifiedVariant, VersionInfo.getInstance(2, 0, 0), 158, @@ -1416,8 +1420,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kTraditionalVariant_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kTraditionalVariant_Detail = + new UCDPropertyDetail( UcdProperty.kTraditionalVariant, VersionInfo.getInstance(2, 0, 0), 159, @@ -1425,8 +1429,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kSpecializedSemanticVariant_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kSpecializedSemanticVariant_Detail = + new UCDPropertyDetail( UcdProperty.kSpecializedSemanticVariant, VersionInfo.getInstance(2, 0, 0), 160, @@ -1434,8 +1438,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kSemanticVariant_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kSemanticVariant_Detail = + new UCDPropertyDetail( UcdProperty.kSemanticVariant, VersionInfo.getInstance(2, 0, 0), 161, @@ -1443,8 +1447,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kVietnamese_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kVietnamese_Detail = + new UCDPropertyDetail( UcdProperty.kVietnamese, VersionInfo.getInstance(3, 1, 1), 162, @@ -1452,8 +1456,8 @@ public class UcdPropertyDetail { 
true, false, true); - public static UcdPropertyDetail kLau_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kLau_Detail = + new UCDPropertyDetail( UcdProperty.kLau, VersionInfo.getInstance(3, 1, 1), 163, @@ -1461,8 +1465,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kTang_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kTang_Detail = + new UCDPropertyDetail( UcdProperty.kTang, VersionInfo.getInstance(2, 0, 0), 164, @@ -1470,8 +1474,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kZVariant_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kZVariant_Detail = + new UCDPropertyDetail( UcdProperty.kZVariant, VersionInfo.getInstance(2, 0, 0), 165, @@ -1479,8 +1483,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kJapaneseKun_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kJapaneseKun_Detail = + new UCDPropertyDetail( UcdProperty.kJapaneseKun, VersionInfo.getInstance(2, 0, 0), 166, @@ -1488,8 +1492,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kJapaneseOn_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kJapaneseOn_Detail = + new UCDPropertyDetail( UcdProperty.kJapaneseOn, VersionInfo.getInstance(2, 0, 0), 167, @@ -1497,8 +1501,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kKangXi_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kKangXi_Detail = + new UCDPropertyDetail( UcdProperty.kKangXi, VersionInfo.getInstance(2, 0, 0), 168, @@ -1510,8 +1514,8 @@ public class UcdPropertyDetail { // UcdProperty.kAlternateKangXi, VersionInfo.getInstance(2,0,0), // VersionInfo.getInstance(4,0,1), 169, // false, true, false, true); - public static UcdPropertyDetail kBigFive_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail 
kBigFive_Detail = + new UCDPropertyDetail( UcdProperty.kBigFive, VersionInfo.getInstance(2, 0, 0), 170, @@ -1519,8 +1523,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kCCCII_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kCCCII_Detail = + new UCDPropertyDetail( UcdProperty.kCCCII, VersionInfo.getInstance(2, 0, 0), 171, @@ -1528,8 +1532,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kDaeJaweon_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kDaeJaweon_Detail = + new UCDPropertyDetail( UcdProperty.kDaeJaweon, VersionInfo.getInstance(2, 0, 0), 172, @@ -1537,8 +1541,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kEACC_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kEACC_Detail = + new UCDPropertyDetail( UcdProperty.kEACC, VersionInfo.getInstance(2, 0, 0), 173, @@ -1546,8 +1550,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kFrequency_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kFrequency_Detail = + new UCDPropertyDetail( UcdProperty.kFrequency, VersionInfo.getInstance(3, 2, 0), VersionInfo.getInstance(16, 0, 0), @@ -1556,8 +1560,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kGradeLevel_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kGradeLevel_Detail = + new UCDPropertyDetail( UcdProperty.kGradeLevel, VersionInfo.getInstance(3, 2, 0), 175, @@ -1565,8 +1569,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kHDZRadBreak_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kHDZRadBreak_Detail = + new UCDPropertyDetail( UcdProperty.kHDZRadBreak, VersionInfo.getInstance(4, 1, 0), 176, @@ -1574,8 +1578,8 @@ public class UcdPropertyDetail { true, false, true); - public static 
UcdPropertyDetail kHKGlyph_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kHKGlyph_Detail = + new UCDPropertyDetail( UcdProperty.kHKGlyph, VersionInfo.getInstance(3, 1, 1), 177, @@ -1583,8 +1587,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kHanyuPinlu_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kHanyuPinlu_Detail = + new UCDPropertyDetail( UcdProperty.kHanyuPinlu, VersionInfo.getInstance(4, 0, 1), 178, @@ -1592,8 +1596,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kHanyuPinyin_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kHanyuPinyin_Detail = + new UCDPropertyDetail( UcdProperty.kHanyuPinyin, VersionInfo.getInstance(5, 2, 0), 179, @@ -1601,8 +1605,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kIRGHanyuDaZidian_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRGHanyuDaZidian_Detail = + new UCDPropertyDetail( UcdProperty.kIRGHanyuDaZidian, VersionInfo.getInstance(3, 0, 0), 180, @@ -1610,8 +1614,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kIRGKangXi_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRGKangXi_Detail = + new UCDPropertyDetail( UcdProperty.kIRGKangXi, VersionInfo.getInstance(3, 0, 0), 181, @@ -1619,8 +1623,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kIRGDaeJaweon_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRGDaeJaweon_Detail = + new UCDPropertyDetail( UcdProperty.kIRGDaeJaweon, VersionInfo.getInstance(3, 0, 0), 182, @@ -1628,8 +1632,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kIRGDaiKanwaZiten_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRGDaiKanwaZiten_Detail = + new UCDPropertyDetail( UcdProperty.kIRGDaiKanwaZiten, 
VersionInfo.getInstance(3, 0, 0), VersionInfo.getInstance(15, 1, 0), @@ -1638,8 +1642,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kKorean_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kKorean_Detail = + new UCDPropertyDetail( UcdProperty.kKorean, VersionInfo.getInstance(2, 0, 0), 184, @@ -1647,8 +1651,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kMainlandTelegraph_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kMainlandTelegraph_Detail = + new UCDPropertyDetail( UcdProperty.kMainlandTelegraph, VersionInfo.getInstance(2, 0, 0), 185, @@ -1656,8 +1660,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kMorohashi_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kMorohashi_Detail = + new UCDPropertyDetail( UcdProperty.kMorohashi, VersionInfo.getInstance(2, 0, 0), 186, @@ -1669,8 +1673,8 @@ public class UcdPropertyDetail { // UcdProperty.kAlternateMorohashi, VersionInfo.getInstance(2,0,0), // VersionInfo.getInstance(4,0,1), 187, // false, true, false, true); - public static UcdPropertyDetail kPrimaryNumeric_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kPrimaryNumeric_Detail = + new UCDPropertyDetail( UcdProperty.kPrimaryNumeric, VersionInfo.getInstance(3, 2, 0), 188, @@ -1678,8 +1682,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kTaiwanTelegraph_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kTaiwanTelegraph_Detail = + new UCDPropertyDetail( UcdProperty.kTaiwanTelegraph, VersionInfo.getInstance(2, 0, 0), 189, @@ -1687,8 +1691,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kXerox_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kXerox_Detail = + new UCDPropertyDetail( UcdProperty.kXerox, VersionInfo.getInstance(2, 0, 0), 190, 
@@ -1696,8 +1700,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kPseudoGB1_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kPseudoGB1_Detail = + new UCDPropertyDetail( UcdProperty.kPseudoGB1, VersionInfo.getInstance(2, 0, 0), 191, @@ -1705,8 +1709,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kIBMJapan_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIBMJapan_Detail = + new UCDPropertyDetail( UcdProperty.kIBMJapan, VersionInfo.getInstance(2, 0, 0), 192, @@ -1714,8 +1718,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kAccountingNumeric_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kAccountingNumeric_Detail = + new UCDPropertyDetail( UcdProperty.kAccountingNumeric, VersionInfo.getInstance(3, 2, 0), 193, @@ -1723,8 +1727,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kCheungBauer_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kCheungBauer_Detail = + new UCDPropertyDetail( UcdProperty.kCheungBauer, VersionInfo.getInstance(5, 0, 0), 194, @@ -1732,8 +1736,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kCheungBauerIndex_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kCheungBauerIndex_Detail = + new UCDPropertyDetail( UcdProperty.kCheungBauerIndex, VersionInfo.getInstance(5, 0, 0), 195, @@ -1741,8 +1745,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kFourCornerCode_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kFourCornerCode_Detail = + new UCDPropertyDetail( UcdProperty.kFourCornerCode, VersionInfo.getInstance(5, 0, 0), 196, @@ -1753,8 +1757,8 @@ public class UcdPropertyDetail { // public static UcdPropertyDetail kWubi_Detail = new UcdPropertyDetail ( // UcdProperty.kWubi, 
VersionInfo.getInstance(11,0,0), 197, // false, true, false, true); - public static UcdPropertyDetail kXHC1983_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kXHC1983_Detail = + new UCDPropertyDetail( UcdProperty.kXHC1983, VersionInfo.getInstance(5, 1, 0), 198, @@ -1762,8 +1766,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kJinmeiyoKanji_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kJinmeiyoKanji_Detail = + new UCDPropertyDetail( UcdProperty.kJinmeiyoKanji, VersionInfo.getInstance(11, 0, 0), 199, @@ -1771,8 +1775,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kJoyoKanji_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kJoyoKanji_Detail = + new UCDPropertyDetail( UcdProperty.kJoyoKanji, VersionInfo.getInstance(11, 0, 0), 200, @@ -1780,8 +1784,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kKoreanEducationHanja_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kKoreanEducationHanja_Detail = + new UCDPropertyDetail( UcdProperty.kKoreanEducationHanja, VersionInfo.getInstance(11, 0, 0), 201, @@ -1789,8 +1793,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kKoreanName_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kKoreanName_Detail = + new UCDPropertyDetail( UcdProperty.kKoreanName, VersionInfo.getInstance(11, 0, 0), 202, @@ -1798,8 +1802,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kTGH_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kTGH_Detail = + new UCDPropertyDetail( UcdProperty.kTGH, VersionInfo.getInstance(11, 0, 0), 203, @@ -1807,8 +1811,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kTGHZ2013_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail 
kTGHZ2013_Detail = + new UCDPropertyDetail( UcdProperty.kTGHZ2013, VersionInfo.getInstance(13, 0, 0), 204, @@ -1816,8 +1820,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kSpoofingVariant_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kSpoofingVariant_Detail = + new UCDPropertyDetail( UcdProperty.kSpoofingVariant, VersionInfo.getInstance(13, 0, 0), 205, @@ -1825,8 +1829,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kRSKanWa_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kRSKanWa_Detail = + new UCDPropertyDetail( UcdProperty.kRSKanWa, VersionInfo.getInstance(2, 0, 0), 206, @@ -1834,8 +1838,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kRSJapanese_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kRSJapanese_Detail = + new UCDPropertyDetail( UcdProperty.kRSJapanese, VersionInfo.getInstance(2, 0, 0), 207, @@ -1843,8 +1847,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kRSKorean_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kRSKorean_Detail = + new UCDPropertyDetail( UcdProperty.kRSKorean, VersionInfo.getInstance(2, 0, 0), 208, @@ -1852,8 +1856,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kRSKangXi_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kRSKangXi_Detail = + new UCDPropertyDetail( UcdProperty.kRSKangXi, VersionInfo.getInstance(2, 0, 0), VersionInfo.getInstance(15, 1, 0), @@ -1862,8 +1866,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kRSAdobe_Japan1_6_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kRSAdobe_Japan1_6_Detail = + new UCDPropertyDetail( UcdProperty.kRSAdobe_Japan1_6, VersionInfo.getInstance(4, 1, 0), 210, @@ -1871,8 +1875,8 @@ public class UcdPropertyDetail { 
true, false, true); - public static UcdPropertyDetail kTotalStrokes_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kTotalStrokes_Detail = + new UCDPropertyDetail( UcdProperty.kTotalStrokes, VersionInfo.getInstance(3, 1, 0), 211, @@ -1880,8 +1884,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kRSTUnicode_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kRSTUnicode_Detail = + new UCDPropertyDetail( UcdProperty.kRSTUnicode, VersionInfo.getInstance(9, 0, 0), 212, @@ -1889,8 +1893,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kTGT_MergedSrc_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kTGT_MergedSrc_Detail = + new UCDPropertyDetail( UcdProperty.kTGT_MergedSrc, VersionInfo.getInstance(9, 0, 0), 213, @@ -1898,8 +1902,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kSrc_NushuDuben_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kSrc_NushuDuben_Detail = + new UCDPropertyDetail( UcdProperty.kSrc_NushuDuben, VersionInfo.getInstance(10, 0, 0), 214, @@ -1907,8 +1911,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kReading_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kReading_Detail = + new UCDPropertyDetail( UcdProperty.kReading, VersionInfo.getInstance(10, 0, 0), 215, @@ -1916,8 +1920,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail ISO_Comment_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail ISO_Comment_Detail = + new UCDPropertyDetail( UcdProperty.ISO_Comment, VersionInfo.getInstance(11, 0, 0), 216, @@ -1925,8 +1929,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Unicode_1_Name_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Unicode_1_Name_Detail = + new 
UCDPropertyDetail( UcdProperty.Unicode_1_Name, VersionInfo.getInstance(2, 0, 0), 217, @@ -1934,8 +1938,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Name_Alias_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Name_Alias_Detail = + new UCDPropertyDetail( UcdProperty.Name_Alias, VersionInfo.getInstance(5, 0, 0), 218, @@ -1943,8 +1947,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Emoji_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Emoji_Detail = + new UCDPropertyDetail( UcdProperty.Emoji, VersionInfo.getInstance(13, 0, 0), 219, @@ -1952,8 +1956,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Emoji_Presentation_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Emoji_Presentation_Detail = + new UCDPropertyDetail( UcdProperty.Emoji_Presentation, VersionInfo.getInstance(13, 0, 0), 220, @@ -1961,8 +1965,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Emoji_Modifier_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Emoji_Modifier_Detail = + new UCDPropertyDetail( UcdProperty.Emoji_Modifier, VersionInfo.getInstance(13, 0, 0), 221, @@ -1970,8 +1974,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Emoji_Modifier_Base_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Emoji_Modifier_Base_Detail = + new UCDPropertyDetail( UcdProperty.Emoji_Modifier_Base, VersionInfo.getInstance(13, 0, 0), 222, @@ -1979,8 +1983,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Emoji_Component_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Emoji_Component_Detail = + new UCDPropertyDetail( UcdProperty.Emoji_Component, VersionInfo.getInstance(13, 0, 0), 223, @@ -1988,8 +1992,8 @@ public class UcdPropertyDetail { false, 
false, true); - public static UcdPropertyDetail Extended_Pictographic_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Extended_Pictographic_Detail = + new UCDPropertyDetail( UcdProperty.Extended_Pictographic, VersionInfo.getInstance(13, 0, 0), 224, @@ -1997,8 +2001,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail kStrange_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kStrange_Detail = + new UCDPropertyDetail( UcdProperty.kStrange, VersionInfo.getInstance(14, 0, 0), 225, @@ -2006,8 +2010,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kAlternateTotalStrokes_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kAlternateTotalStrokes_Detail = + new UCDPropertyDetail( UcdProperty.kAlternateTotalStrokes, VersionInfo.getInstance(15, 0, 0), 226, @@ -2015,8 +2019,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail NFKC_Simple_Casefold_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail NFKC_Simple_Casefold_Detail = + new UCDPropertyDetail( UcdProperty.NFKC_Simple_Casefold, VersionInfo.getInstance(15, 1, 0), 227, @@ -2024,8 +2028,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail ID_Compat_Math_Start_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail ID_Compat_Math_Start_Detail = + new UCDPropertyDetail( UcdProperty.ID_Compat_Math_Start, VersionInfo.getInstance(15, 1, 0), 228, @@ -2033,8 +2037,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail ID_Compat_Math_Continue_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail ID_Compat_Math_Continue_Detail = + new UCDPropertyDetail( UcdProperty.ID_Compat_Math_Continue, VersionInfo.getInstance(15, 1, 0), 229, @@ -2042,8 +2046,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail 
IDS_Unary_Operator_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail IDS_Unary_Operator_Detail = + new UCDPropertyDetail( UcdProperty.IDS_Unary_Operator, VersionInfo.getInstance(15, 1, 0), 230, @@ -2051,8 +2055,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail kJapanese_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kJapanese_Detail = + new UCDPropertyDetail( UcdProperty.kJapanese, VersionInfo.getInstance(15, 1, 0), 231, @@ -2060,8 +2064,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kMojiJoho_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kMojiJoho_Detail = + new UCDPropertyDetail( UcdProperty.kMojiJoho, VersionInfo.getInstance(15, 1, 0), 232, @@ -2069,8 +2073,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kSMSZD2003Index_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kSMSZD2003Index_Detail = + new UCDPropertyDetail( UcdProperty.kSMSZD2003Index, VersionInfo.getInstance(15, 1, 0), 233, @@ -2078,8 +2082,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kSMSZD2003Readings_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kSMSZD2003Readings_Detail = + new UCDPropertyDetail( UcdProperty.kSMSZD2003Readings, VersionInfo.getInstance(15, 1, 0), 234, @@ -2087,8 +2091,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kVietnameseNumeric_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kVietnameseNumeric_Detail = + new UCDPropertyDetail( UcdProperty.kVietnameseNumeric, VersionInfo.getInstance(15, 1, 0), 235, @@ -2096,8 +2100,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kZhuangNumeric_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kZhuangNumeric_Detail = + new UCDPropertyDetail( 
UcdProperty.kZhuangNumeric, VersionInfo.getInstance(15, 1, 0), 236, @@ -2105,8 +2109,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail Indic_Conjunct_Break_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Indic_Conjunct_Break_Detail = + new UCDPropertyDetail( UcdProperty.Indic_Conjunct_Break, VersionInfo.getInstance(15, 1, 0), 237, @@ -2114,8 +2118,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Modifier_Combining_Mark_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Modifier_Combining_Mark_Detail = + new UCDPropertyDetail( UcdProperty.Modifier_Combining_Mark, VersionInfo.getInstance(16, 0, 0), 238, @@ -2123,8 +2127,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail kFanqie_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kFanqie_Detail = + new UCDPropertyDetail( UcdProperty.kFanqie, VersionInfo.getInstance(16, 0, 0), 239, @@ -2132,8 +2136,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kZhuang_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kZhuang_Detail = + new UCDPropertyDetail( UcdProperty.kZhuang, VersionInfo.getInstance(16, 0, 0), 240, @@ -2141,25 +2145,25 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail Basic_Emoji_Detail = - new UcdPropertyDetail(UcdProperty.Basic_Emoji, -1, false, false, false, false); - public static UcdPropertyDetail CJK_Radical_Detail = - new UcdPropertyDetail(UcdProperty.CJK_Radical, -2, false, false, false, false); - public static UcdPropertyDetail Confusable_MA_Detail = - new UcdPropertyDetail(UcdProperty.Confusable_MA, -3, false, false, false, false); - public static UcdPropertyDetail Confusable_ML_Detail = - new UcdPropertyDetail(UcdProperty.Confusable_ML, -4, false, false, false, false); - public static UcdPropertyDetail Confusable_SA_Detail = 
- new UcdPropertyDetail(UcdProperty.Confusable_SA, -5, false, false, false, false); - public static UcdPropertyDetail Confusable_SL_Detail = - new UcdPropertyDetail(UcdProperty.Confusable_SL, -6, false, false, false, false); - public static UcdPropertyDetail Do_Not_Emit_Preferred_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Basic_Emoji_Detail = + new UCDPropertyDetail(UcdProperty.Basic_Emoji, -1, false, false, false, false); + public static UCDPropertyDetail CJK_Radical_Detail = + new UCDPropertyDetail(UcdProperty.CJK_Radical, -2, false, false, false, false); + public static UCDPropertyDetail Confusable_MA_Detail = + new UCDPropertyDetail(UcdProperty.Confusable_MA, -3, false, false, false, false); + public static UCDPropertyDetail Confusable_ML_Detail = + new UCDPropertyDetail(UcdProperty.Confusable_ML, -4, false, false, false, false); + public static UCDPropertyDetail Confusable_SA_Detail = + new UCDPropertyDetail(UcdProperty.Confusable_SA, -5, false, false, false, false); + public static UCDPropertyDetail Confusable_SL_Detail = + new UCDPropertyDetail(UcdProperty.Confusable_SL, -6, false, false, false, false); + public static UCDPropertyDetail Do_Not_Emit_Preferred_Detail = + new UCDPropertyDetail( UcdProperty.Do_Not_Emit_Preferred, -7, false, false, false, false); - public static UcdPropertyDetail Do_Not_Emit_Type_Detail = - new UcdPropertyDetail(UcdProperty.Do_Not_Emit_Type, -8, false, false, false, false); - public static UcdPropertyDetail Emoji_DCM_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Do_Not_Emit_Type_Detail = + new UCDPropertyDetail(UcdProperty.Do_Not_Emit_Type, -8, false, false, false, false); + public static UCDPropertyDetail Emoji_DCM_Detail = + new UCDPropertyDetail( UcdProperty.Emoji_DCM, VersionInfo.getInstance(6, 0, 0), -9, @@ -2167,8 +2171,8 @@ public class UcdPropertyDetail { false, false, false); - public static UcdPropertyDetail Emoji_KDDI_Detail = - new UcdPropertyDetail( + public static 
UCDPropertyDetail Emoji_KDDI_Detail = + new UCDPropertyDetail( UcdProperty.Emoji_KDDI, VersionInfo.getInstance(6, 0, 0), -10, @@ -2176,8 +2180,8 @@ public class UcdPropertyDetail { false, false, false); - public static UcdPropertyDetail Emoji_SB_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Emoji_SB_Detail = + new UCDPropertyDetail( UcdProperty.Emoji_SB, VersionInfo.getInstance(6, 0, 0), -11, @@ -2185,8 +2189,8 @@ public class UcdPropertyDetail { false, false, false); - public static UcdPropertyDetail Identifier_Status_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Identifier_Status_Detail = + new UCDPropertyDetail( UcdProperty.Identifier_Status, VersionInfo.getInstance(9, 0, 0), -12, @@ -2194,8 +2198,8 @@ public class UcdPropertyDetail { false, false, false); - public static UcdPropertyDetail Identifier_Type_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Identifier_Type_Detail = + new UCDPropertyDetail( UcdProperty.Identifier_Type, VersionInfo.getInstance(9, 0, 0), -13, @@ -2203,36 +2207,36 @@ public class UcdPropertyDetail { false, false, false); - public static UcdPropertyDetail Idn_2008_Detail = - new UcdPropertyDetail(UcdProperty.Idn_2008, -14, false, false, false, false); - public static UcdPropertyDetail Idn_Mapping_Detail = - new UcdPropertyDetail(UcdProperty.Idn_Mapping, -15, false, false, false, false); - public static UcdPropertyDetail Idn_Status_Detail = - new UcdPropertyDetail(UcdProperty.Idn_Status, -16, false, false, false, false); - public static UcdPropertyDetail Named_Sequences_Detail = - new UcdPropertyDetail(UcdProperty.Named_Sequences, -17, false, false, false, false); - public static UcdPropertyDetail Named_Sequences_Prov_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Idn_2008_Detail = + new UCDPropertyDetail(UcdProperty.Idn_2008, -14, false, false, false, false); + public static UCDPropertyDetail Idn_Mapping_Detail = + new 
UCDPropertyDetail(UcdProperty.Idn_Mapping, -15, false, false, false, false); + public static UCDPropertyDetail Idn_Status_Detail = + new UCDPropertyDetail(UcdProperty.Idn_Status, -16, false, false, false, false); + public static UCDPropertyDetail Named_Sequences_Detail = + new UCDPropertyDetail(UcdProperty.Named_Sequences, -17, false, false, false, false); + public static UCDPropertyDetail Named_Sequences_Prov_Detail = + new UCDPropertyDetail( UcdProperty.Named_Sequences_Prov, -18, false, false, false, false); - public static UcdPropertyDetail Other_Joining_Type_Detail = - new UcdPropertyDetail(UcdProperty.Other_Joining_Type, -19, false, false, false, false); - public static UcdPropertyDetail RGI_Emoji_Flag_Sequence_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Other_Joining_Type_Detail = + new UCDPropertyDetail(UcdProperty.Other_Joining_Type, -19, false, false, false, false); + public static UCDPropertyDetail RGI_Emoji_Flag_Sequence_Detail = + new UCDPropertyDetail( UcdProperty.RGI_Emoji_Flag_Sequence, -20, false, false, false, false); - public static UcdPropertyDetail RGI_Emoji_Keycap_Sequence_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail RGI_Emoji_Keycap_Sequence_Detail = + new UCDPropertyDetail( UcdProperty.RGI_Emoji_Keycap_Sequence, -21, false, false, false, false); - public static UcdPropertyDetail RGI_Emoji_Modifier_Sequence_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail RGI_Emoji_Modifier_Sequence_Detail = + new UCDPropertyDetail( UcdProperty.RGI_Emoji_Modifier_Sequence, -22, false, false, false, false); - public static UcdPropertyDetail RGI_Emoji_Tag_Sequence_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail RGI_Emoji_Tag_Sequence_Detail = + new UCDPropertyDetail( UcdProperty.RGI_Emoji_Tag_Sequence, -23, false, false, false, false); - public static UcdPropertyDetail RGI_Emoji_Zwj_Sequence_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail 
RGI_Emoji_Zwj_Sequence_Detail = + new UCDPropertyDetail( UcdProperty.RGI_Emoji_Zwj_Sequence, -24, false, false, false, false); - public static UcdPropertyDetail Standardized_Variant_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Standardized_Variant_Detail = + new UCDPropertyDetail( UcdProperty.Standardized_Variant, -25, false, false, false, false); private UcdProperty ucdProperty; @@ -2244,7 +2248,7 @@ public class UcdPropertyDetail { private boolean isCJKShowIfEmpty; private boolean isOrgUCDXMLAttribute; - private UcdPropertyDetail( + private UCDPropertyDetail( UcdProperty ucdProperty, VersionInfo minVersion, int sortOrder, @@ -2263,7 +2267,7 @@ private UcdPropertyDetail( isOrgUCDXMLAttribute); } - private UcdPropertyDetail( + private UCDPropertyDetail( UcdProperty ucdProperty, int sortOrder, boolean isBaseAttribute, @@ -2281,7 +2285,7 @@ private UcdPropertyDetail( isOrgUCDXMLAttribute); } - private UcdPropertyDetail( + private UCDPropertyDetail( UcdProperty ucdProperty, VersionInfo minVersion, VersionInfo maxVersion, @@ -2310,19 +2314,19 @@ private UcdPropertyDetail( } } - public static Set values() { + public static Set values() { return allPropertyDetails; } - public static Set baseValues() { + public static Set baseValues() { return basePropertyDetails; } - public static Set cjkValues() { + public static Set cjkValues() { return cjkPropertyDetails; } - public static Set ucdxmlValues() { + public static Set ucdxmlValues() { return ucdxmlPropertyDetails; } diff --git a/unicodetools/src/main/java/org/unicode/xml/UcdSectionComponent.java b/unicodetools/src/main/java/org/unicode/xml/UCDSectionComponent.java similarity index 75% rename from unicodetools/src/main/java/org/unicode/xml/UcdSectionComponent.java rename to unicodetools/src/main/java/org/unicode/xml/UCDSectionComponent.java index 0773486ccf..550fcbbaf7 100644 --- a/unicodetools/src/main/java/org/unicode/xml/UcdSectionComponent.java +++ 
b/unicodetools/src/main/java/org/unicode/xml/UCDSectionComponent.java @@ -3,12 +3,15 @@ import com.ibm.icu.util.VersionInfo; import org.unicode.props.UcdProperty; -public class UcdSectionComponent { +/** + * Helper class that defines an object that stores the version range of a given UcdProperty. + */ +public class UCDSectionComponent { private final VersionInfo minVersion; private final VersionInfo maxVersion; private final UcdProperty ucdProperty; - UcdSectionComponent(VersionInfo minVersion, VersionInfo maxVersion, UcdProperty ucdProperty) { + UCDSectionComponent(VersionInfo minVersion, VersionInfo maxVersion, UcdProperty ucdProperty) { this.minVersion = minVersion; this.maxVersion = maxVersion; this.ucdProperty = ucdProperty; diff --git a/unicodetools/src/main/java/org/unicode/xml/UcdSectionDetail.java b/unicodetools/src/main/java/org/unicode/xml/UCDSectionDetail.java similarity index 72% rename from unicodetools/src/main/java/org/unicode/xml/UcdSectionDetail.java rename to unicodetools/src/main/java/org/unicode/xml/UCDSectionDetail.java index ceed693afd..ac84a5a414 100644 --- a/unicodetools/src/main/java/org/unicode/xml/UcdSectionDetail.java +++ b/unicodetools/src/main/java/org/unicode/xml/UCDSectionDetail.java @@ -3,7 +3,12 @@ import com.ibm.icu.util.VersionInfo; import org.unicode.props.UcdProperty; -public class UcdSectionDetail { +/** + * Helper class that defines an object that stores information about a section of the UCDXML file. + * Information includes the section name, the type of elements that the section contains, and the version range of + * the section. 
+ */ +public class UCDSectionDetail { public enum UcdSection { BLOCKS( @@ -74,7 +79,7 @@ public enum UcdSection { private final String childTag; private final VersionInfo minVersion; private final VersionInfo maxVersion; - private final UcdSectionDetail ucdSectionDetail; + private final UCDSectionDetail ucdSectionDetail; private final boolean parserWithRange; private final boolean parserWithMissing; @@ -83,7 +88,7 @@ public enum UcdSection { String childTag, VersionInfo minVersion, VersionInfo maxVersion, - UcdSectionDetail ucdSectionDetail, + UCDSectionDetail ucdSectionDetail, boolean parserWithRange, boolean parserWithMissing) { this.tag = tag; @@ -111,7 +116,7 @@ public VersionInfo getMaxVersion() { return maxVersion; } - public UcdSectionDetail getUcdSectionDetail() { + public UCDSectionDetail getUcdSectionDetail() { return ucdSectionDetail; } @@ -124,75 +129,75 @@ public boolean getParserWithMissing() { } } - public static UcdSectionDetail Blocks_Detail = - new UcdSectionDetail( + public static UCDSectionDetail Blocks_Detail = + new UCDSectionDetail( UcdSection.BLOCKS, - new UcdSectionComponent[] { - new UcdSectionComponent( + new UCDSectionComponent[] { + new UCDSectionComponent( VersionInfo.getInstance(1, 1, 0), null, UcdProperty.Block) }, 0); - public static UcdSectionDetail NamedSequences_Detail = - new UcdSectionDetail( + public static UCDSectionDetail NamedSequences_Detail = + new UCDSectionDetail( UcdSection.NAMEDSEQUENCES, - new UcdSectionComponent[] { - new UcdSectionComponent( + new UCDSectionComponent[] { + new UCDSectionComponent( VersionInfo.getInstance(1, 1, 0), null, UcdProperty.Named_Sequences) }, 1); - public static UcdSectionDetail ProvisionalNamedSequences_Detail = - new UcdSectionDetail( + public static UCDSectionDetail ProvisionalNamedSequences_Detail = + new UCDSectionDetail( UcdSection.PROVISIONALNAMEDSEQUENCES, - new UcdSectionComponent[] { - new UcdSectionComponent( + new UCDSectionComponent[] { + new UCDSectionComponent( 
VersionInfo.getInstance(5, 0, 0), VersionInfo.getInstance(13, 0, 0), UcdProperty.Named_Sequences_Prov) }, 1); - public static UcdSectionDetail NormalizationCorrections_Detail = - new UcdSectionDetail( + public static UCDSectionDetail NormalizationCorrections_Detail = + new UCDSectionDetail( UcdSection.NORMALIZATIONCORRECTIONS, - new UcdSectionComponent[] { - new UcdSectionComponent( + new UCDSectionComponent[] { + new UCDSectionComponent( VersionInfo.getInstance(1, 1, 0), null, UcdProperty.NC_Original) }, 2); - public static UcdSectionDetail StandardizedVariants_Detail = - new UcdSectionDetail( + public static UCDSectionDetail StandardizedVariants_Detail = + new UCDSectionDetail( UcdSection.STANDARDIZEDVARIANTS, - new UcdSectionComponent[] { - new UcdSectionComponent( + new UCDSectionComponent[] { + new UCDSectionComponent( VersionInfo.getInstance(1, 1, 0), null, UcdProperty.Standardized_Variant), - new UcdSectionComponent( + new UCDSectionComponent( VersionInfo.getInstance(13, 0, 0), null, UcdProperty.emoji_variation_sequence) }, 3); - public static UcdSectionDetail CJKRadicals_Detail = - new UcdSectionDetail( + public static UCDSectionDetail CJKRadicals_Detail = + new UCDSectionDetail( UcdSection.CJKRADICALS, - new UcdSectionComponent[] { - new UcdSectionComponent( + new UCDSectionComponent[] { + new UCDSectionComponent( VersionInfo.getInstance(1, 1, 0), null, UcdProperty.CJK_Radical) }, 4); - public static UcdSectionDetail EmojiSources_Detail = - new UcdSectionDetail( + public static UCDSectionDetail EmojiSources_Detail = + new UCDSectionDetail( UcdSection.EMOJISOURCES, - new UcdSectionComponent[] { - new UcdSectionComponent( + new UCDSectionComponent[] { + new UCDSectionComponent( VersionInfo.getInstance(1, 1, 0), null, UcdProperty.Emoji_DCM) }, 5); - public static UcdSectionDetail DoNotEmit_Detail = - new UcdSectionDetail( + public static UCDSectionDetail DoNotEmit_Detail = + new UCDSectionDetail( UcdSection.DONOTEMIT, - new UcdSectionComponent[] { - new 
UcdSectionComponent( + new UCDSectionComponent[] { + new UCDSectionComponent( VersionInfo.getInstance(1, 1, 0), null, UcdProperty.Do_Not_Emit_Type) @@ -200,11 +205,11 @@ public boolean getParserWithMissing() { 6); private final UcdSection ucdSection; - private final UcdSectionComponent[] ucdSectionComponents; + private final UCDSectionComponent[] ucdSectionComponents; private final int sortOrder; - private UcdSectionDetail( - UcdSection ucdSection, UcdSectionComponent[] ucdSectionComponents, int sortOrder) { + private UCDSectionDetail( + UcdSection ucdSection, UCDSectionComponent[] ucdSectionComponents, int sortOrder) { this.ucdSection = ucdSection; this.ucdSectionComponents = ucdSectionComponents; this.sortOrder = sortOrder; @@ -214,7 +219,7 @@ public UcdSection getSection() { return this.ucdSection; } - public UcdSectionComponent[] getUcdSectionComponents() { + public UCDSectionComponent[] getUcdSectionComponents() { return this.ucdSectionComponents; } diff --git a/unicodetools/src/main/java/org/unicode/xml/UcdXML.java b/unicodetools/src/main/java/org/unicode/xml/UCDXML.java similarity index 85% rename from unicodetools/src/main/java/org/unicode/xml/UcdXML.java rename to unicodetools/src/main/java/org/unicode/xml/UCDXML.java index c71ac10826..a07be9c21f 100644 --- a/unicodetools/src/main/java/org/unicode/xml/UcdXML.java +++ b/unicodetools/src/main/java/org/unicode/xml/UCDXML.java @@ -2,9 +2,22 @@ import com.ibm.icu.dev.tool.UOption; import com.ibm.icu.util.VersionInfo; -import java.io.*; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; import java.nio.charset.StandardCharsets; -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Locale; +import java.util.Map; +import java.util.Objects; 
import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.transform.TransformerConfigurationException; @@ -14,7 +27,12 @@ import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; -public class UcdXML { +/** + * Utility for generating UCDXML files. + * The utility can build flat or grouped versions of UCDXML for non-Unihan code points, Unihan code points, or the + * complete range of code points. + */ +public class UCDXML { private static final String NAMESPACE = "http://www.unicode.org/ns/2003/ucd/1.0"; @@ -72,7 +90,7 @@ public static void main(String[] args) throws Exception { if (options[HELP].doesOccur) { System.out.println( - "UcdXML --ucdversion {version number} --outputfolder {destination} " + "UCDXML --ucdversion {version number} --outputfolder {destination} " + "--range [ALL|NOUNIHAN|UNIHAN] --output [FLAT|GROUPED]"); System.exit(0); } @@ -173,11 +191,11 @@ private static void buildUcdXMLFile( UCDXMLOUTPUTRANGE outputRange, UCDXMLOUTPUTTYPE outputType) throws IOException, TransformerConfigurationException, SAXException { - int lowCodepoint = 0x0; - int highCodepoint = 0x10FFFF; + int lowCodePoint = 0x0; + int highCodePoint = 0x10FFFF; // Tangut - // int lowCodepoint = 0x17000; - // int highCodepoint = 0x1B2FB; + // int lowCodePoint = 0x17000; + // int highCodePoint = 0x1B2FB; // 0x10FFFF File tempFile = new File(destinationFolder, "temp.xml"); @@ -208,24 +226,24 @@ private static void buildUcdXMLFile( writer, attributeResolver, ucdVersion, - lowCodepoint, - highCodepoint, + lowCodePoint, + highCodePoint, outputRange, outputType); if (outputRange != UCDXMLOUTPUTRANGE.UNIHAN) { - ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.BLOCKS); - ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.NAMEDSEQUENCES); - ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.PROVISIONALNAMEDSEQUENCES); - ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.NORMALIZATIONCORRECTIONS); - 
ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.STANDARDIZEDVARIANTS); + ucdDataResolver.buildSection(UCDSectionDetail.UcdSection.BLOCKS); + ucdDataResolver.buildSection(UCDSectionDetail.UcdSection.NAMEDSEQUENCES); + ucdDataResolver.buildSection(UCDSectionDetail.UcdSection.PROVISIONALNAMEDSEQUENCES); + ucdDataResolver.buildSection(UCDSectionDetail.UcdSection.NORMALIZATIONCORRECTIONS); + ucdDataResolver.buildSection(UCDSectionDetail.UcdSection.STANDARDIZEDVARIANTS); if (ucdVersion.compareTo(VersionInfo.getInstance(5, 2, 0)) >= 0) { - ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.CJKRADICALS); + ucdDataResolver.buildSection(UCDSectionDetail.UcdSection.CJKRADICALS); } if (ucdVersion.compareTo(VersionInfo.getInstance(6, 0, 0)) >= 0) { - ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.EMOJISOURCES); + ucdDataResolver.buildSection(UCDSectionDetail.UcdSection.EMOJISOURCES); } if (ucdVersion.compareTo(VersionInfo.getInstance(16, 0, 0)) >= 0) { - ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.DONOTEMIT); + ucdDataResolver.buildSection(UCDSectionDetail.UcdSection.DONOTEMIT); } } writer.endElement("ucd"); @@ -274,34 +292,34 @@ private static void buildRepertoire( UCDXMLWriter writer, AttributeResolver attributeResolver, VersionInfo ucdVersion, - int lowCodepoint, - int highCodepoint, + int lowCodePoint, + int highCodePoint, UCDXMLOUTPUTRANGE outputRange, UCDXMLOUTPUTTYPE outputType) throws SAXException { writer.startElement("repertoire"); { - for (int codepoint = lowCodepoint; codepoint <= highCodepoint; codepoint++) { - if (isWritableCodepoint(codepoint, outputRange, attributeResolver)) { + for (int CodePoint = lowCodePoint; CodePoint <= highCodePoint; CodePoint++) { + if (isWritableCodePoint(CodePoint, outputRange, attributeResolver)) { if (outputType == UCDXMLOUTPUTTYPE.GROUPED) { - codepoint = + CodePoint = buildGroup( writer, attributeResolver, ucdVersion, - codepoint, - highCodepoint, + CodePoint, + highCodePoint, outputRange, 
outputType); } else { - codepoint = + CodePoint = buildChars( writer, attributeResolver, ucdVersion, - codepoint, - highCodepoint, + CodePoint, + highCodePoint, outputRange, outputType, null); @@ -316,21 +334,21 @@ private static int buildGroup( UCDXMLWriter writer, AttributeResolver attributeResolver, VersionInfo ucdVersion, - int lowCodepoint, - int highCodepoint, + int lowCodePoint, + int highCodePoint, UCDXMLOUTPUTRANGE outputRange, UCDXMLOUTPUTTYPE outputType) throws SAXException { - int lastCodepointInGroup = - getLastCodepointInGroup(attributeResolver, lowCodepoint, highCodepoint); + int lastCodePointInGroup = + getLastCodePointInGroup(attributeResolver, lowCodePoint, highCodePoint); AttributesImpl groupAttrs = getGroupAttributes( ucdVersion, attributeResolver, - lowCodepoint, - lastCodepointInGroup, + lowCodePoint, + lastCodePointInGroup, outputRange); writer.startElement("group", groupAttrs); @@ -339,22 +357,22 @@ private static int buildGroup( writer, attributeResolver, ucdVersion, - lowCodepoint, - lastCodepointInGroup, + lowCodePoint, + lastCodePointInGroup, outputRange, outputType, groupAttrs); writer.endElement("group"); } - return lastCodepointInGroup; + return lastCodePointInGroup; } private static int buildChars( UCDXMLWriter writer, AttributeResolver attributeResolver, VersionInfo ucdVersion, - int lowCodepoint, - int highCodepoint, + int lowCodePoint, + int highCodePoint, UCDXMLOUTPUTRANGE outputRange, UCDXMLOUTPUTTYPE outputType, AttributesImpl groupAttrs) @@ -362,15 +380,15 @@ private static int buildChars( ArrayList range = new ArrayList<>(); Range rangeType = Range.NONRANGE; - for (int codepoint = lowCodepoint; codepoint <= highCodepoint; codepoint++) { - if (attributeResolver.isUnassignedCodepoint(codepoint) + for (int CodePoint = lowCodePoint; CodePoint <= highCodePoint; CodePoint++) { + if (attributeResolver.isUnassignedCodePoint(CodePoint) || (outputRange == UCDXMLOUTPUTRANGE.NOUNIHAN - && attributeResolver.isUnifiedIdeograph(codepoint))) 
{ - Range currentRangeType = getRangeType(attributeResolver, codepoint); + && attributeResolver.isUnifiedIdeograph(CodePoint))) { + Range currentRangeType = getRangeType(attributeResolver, CodePoint); if (!range.isEmpty()) { if (!currentRangeType.equals(rangeType) || attributeResolver.isDifferentRange( - ucdVersion, codepoint, codepoint - 1)) { + ucdVersion, CodePoint, CodePoint - 1)) { if (outputRange != UCDXMLOUTPUTRANGE.UNIHAN) { if (outputType == UCDXMLOUTPUTTYPE.GROUPED) { buildGroupedRange( @@ -388,7 +406,7 @@ private static int buildChars( range.clear(); } } - range.add(codepoint); + range.add(CodePoint); rangeType = currentRangeType; } else { if (!range.isEmpty()) { @@ -409,18 +427,18 @@ private static int buildChars( range.clear(); rangeType = Range.NONRANGE; } - if (isWritableCodepoint(codepoint, outputRange, attributeResolver)) { + if (isWritableCodePoint(CodePoint, outputRange, attributeResolver)) { if (outputType == UCDXMLOUTPUTTYPE.GROUPED) { buildGroupedChar( writer, attributeResolver, ucdVersion, - codepoint, + CodePoint, outputRange, groupAttrs); } else { buildUngroupedChar( - writer, attributeResolver, ucdVersion, codepoint, outputRange); + writer, attributeResolver, ucdVersion, CodePoint, outputRange); } } } @@ -436,38 +454,38 @@ private static int buildChars( } } } - return highCodepoint; + return highCodePoint; } private static void buildUngroupedChar( UCDXMLWriter writer, AttributeResolver attributeResolver, VersionInfo ucdVersion, - int codepoint, + int CodePoint, UCDXMLOUTPUTRANGE outputRange) throws SAXException { AttributesImpl charAttributes = - getAttributes(ucdVersion, attributeResolver, codepoint, outputRange); - buildChar(writer, attributeResolver, codepoint, charAttributes); + getAttributes(ucdVersion, attributeResolver, CodePoint, outputRange); + buildChar(writer, attributeResolver, CodePoint, charAttributes); } private static void buildGroupedChar( UCDXMLWriter writer, AttributeResolver attributeResolver, VersionInfo ucdVersion, - 
int codepoint, + int CodePoint, UCDXMLOUTPUTRANGE outputRange, AttributesImpl groupAttrs) throws SAXException { AttributesImpl orgCharAttributes = - getAttributes(ucdVersion, attributeResolver, codepoint, outputRange); + getAttributes(ucdVersion, attributeResolver, CodePoint, outputRange); AttributesImpl charAttributes = new AttributesImpl(); charAttributes.addAttribute( - NAMESPACE, "cp", "cp", "CDATA", attributeResolver.getHexString(codepoint)); + NAMESPACE, "cp", "cp", "CDATA", attributeResolver.getHexString(CodePoint)); - for (UcdPropertyDetail propDetail : UcdPropertyDetail.ucdxmlValues()) { + for (UCDPropertyDetail propDetail : UCDPropertyDetail.ucdxmlValues()) { String qName = propDetail.getUcdProperty().getShortName(); if (qName.startsWith("cjk")) { qName = qName.substring(2); @@ -483,18 +501,18 @@ private static void buildGroupedChar( Objects.requireNonNullElse(orgCharAttributesValue, "")); } } - buildChar(writer, attributeResolver, codepoint, charAttributes); + buildChar(writer, attributeResolver, CodePoint, charAttributes); } private static void buildChar( UCDXMLWriter writer, AttributeResolver attributeResolver, - int codepoint, + int CodePoint, AttributesImpl charAttributes) throws SAXException { writer.startElement("char", charAttributes); { - HashMap nameAliases = attributeResolver.getNameAliases(codepoint); + HashMap nameAliases = attributeResolver.getNameAliases(CodePoint); if (null != nameAliases && !nameAliases.isEmpty()) { for (String alias : nameAliases.keySet()) { AttributesImpl nameAliasAt = new AttributesImpl(); @@ -543,7 +561,7 @@ private static void buildGroupedRange( attributeResolver.getHexString(range.get(range.size() - 1))); } - for (UcdPropertyDetail propDetail : UcdPropertyDetail.ucdxmlValues()) { + for (UCDPropertyDetail propDetail : UCDPropertyDetail.ucdxmlValues()) { String qName = propDetail.getUcdProperty().getShortName(); if (qName.startsWith("cjk")) { qName = qName.substring(2); @@ -580,20 +598,20 @@ private static void 
buildUngroupedRange( } } - private static boolean isWritableCodepoint( - int codepoint, UCDXMLOUTPUTRANGE outputRange, AttributeResolver attributeResolver) { + private static boolean isWritableCodePoint( + int CodePoint, UCDXMLOUTPUTRANGE outputRange, AttributeResolver attributeResolver) { return outputRange == UCDXMLOUTPUTRANGE.ALL || (outputRange == UCDXMLOUTPUTRANGE.UNIHAN - && attributeResolver.isUnihanAttributeRange(codepoint)) + && attributeResolver.isUnihanAttributeRange(CodePoint)) || (outputRange == UCDXMLOUTPUTRANGE.NOUNIHAN - && !attributeResolver.isUnifiedIdeograph(codepoint)); + && !attributeResolver.isUnifiedIdeograph(CodePoint)); } - private static Range getRangeType(AttributeResolver attributeResolver, int codepoint) { - String NChar = attributeResolver.getNChar(codepoint); - UcdPropertyValues.General_Category_Values gc = attributeResolver.getgc(codepoint); + private static Range getRangeType(AttributeResolver attributeResolver, int CodePoint) { + String NChar = attributeResolver.getNChar(CodePoint); + UcdPropertyValues.General_Category_Values gc = attributeResolver.getgc(CodePoint); - if (attributeResolver.isUnihanAttributeRange(codepoint)) { + if (attributeResolver.isUnihanAttributeRange(CodePoint)) { return Range.CJKUNIFIEDIDEOGRAPH; } if (gc.equals(UcdPropertyValues.General_Category_Values.Surrogate)) { @@ -608,44 +626,44 @@ private static Range getRangeType(AttributeResolver attributeResolver, int codep return Range.RESERVED; } - private static int getLastCodepointInGroup( - AttributeResolver attributeResolver, int lowCodepoint, int highCodepoint) { - String blk = attributeResolver.getAttributeValue(UcdProperty.Block, lowCodepoint); - for (int codepoint = lowCodepoint; codepoint <= highCodepoint; codepoint++) { - if (!blk.equals(attributeResolver.getAttributeValue(UcdProperty.Block, codepoint))) { - return codepoint - 1; + private static int getLastCodePointInGroup( + AttributeResolver attributeResolver, int lowCodePoint, int highCodePoint) { + 
String blk = attributeResolver.getAttributeValue(UcdProperty.Block, lowCodePoint); + for (int CodePoint = lowCodePoint; CodePoint <= highCodePoint; CodePoint++) { + if (!blk.equals(attributeResolver.getAttributeValue(UcdProperty.Block, CodePoint))) { + return CodePoint - 1; } - if (codepoint == 0x20 - 1 // put the C0 controls in their own group - || codepoint == 0xa0 - 1 // put the C0 controls in their own group - || codepoint == 0x1160 - 1 // split the jamos into three groups - || codepoint == 0x11a8 - 1 // split the jamos into three groups - || codepoint == 0x1f1e6 - 1 // put the regional indicators in their own group + if (CodePoint == 0x20 - 1 // put the C0 controls in their own group + || CodePoint == 0xa0 - 1 // put the C1 controls in their own group + || CodePoint == 0x1160 - 1 // split the jamos into three groups + || CodePoint == 0x11a8 - 1 // split the jamos into three groups + || CodePoint == 0x1f1e6 - 1 // put the regional indicators in their own group ) { - return codepoint; + return CodePoint; } } - return highCodepoint; + return highCodePoint; } private static AttributesImpl getAttributes( VersionInfo version, AttributeResolver attributeResolver, - int codepoint, + int CodePoint, UCDXMLOUTPUTRANGE outputRange) { AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute( - NAMESPACE, "cp", "cp", "CDATA", attributeResolver.getHexString(codepoint)); + NAMESPACE, "cp", "cp", "CDATA", attributeResolver.getHexString(CodePoint)); - for (UcdPropertyDetail propDetail : UcdPropertyDetail.ucdxmlValues()) { + for (UCDPropertyDetail propDetail : UCDPropertyDetail.ucdxmlValues()) { UcdProperty prop = propDetail.getUcdProperty(); if (version.compareTo(propDetail.getMinVersion()) >= 0 && (propDetail.getMaxVersion() == null || version.compareTo(propDetail.getMaxVersion()) < 0)) { - String attrValue = attributeResolver.getAttributeValue(prop, codepoint); + String attrValue = attributeResolver.getAttributeValue(prop, CodePoint); boolean 
isAttributeIncluded = getIsAttributeIncluded( attrValue, - attributeResolver.isUnihanAttributeRange(codepoint), + attributeResolver.isUnihanAttributeRange(CodePoint), propDetail, prop, outputRange); @@ -664,12 +682,12 @@ private static AttributesImpl getAttributes( private static AttributesImpl getGroupAttributes( VersionInfo version, AttributeResolver attributeResolver, - int lowCodepoint, - int highCodepoint, + int lowCodePoint, + int highCodePoint, UCDXMLOUTPUTRANGE outputRange) { AttributesImpl attributes = new AttributesImpl(); - for (UcdPropertyDetail propDetail : UcdPropertyDetail.ucdxmlValues()) { + for (UCDPropertyDetail propDetail : UCDPropertyDetail.ucdxmlValues()) { UcdProperty prop = propDetail.getUcdProperty(); if (version.compareTo(propDetail.getMinVersion()) >= 0 && (propDetail.getMaxVersion() == null @@ -677,9 +695,9 @@ private static AttributesImpl getGroupAttributes( int totalCount = 0; Map counters = new LinkedHashMap<>(); - for (int codepoint = lowCodepoint; codepoint <= highCodepoint; codepoint++) { - if (!attributeResolver.isUnassignedCodepoint(codepoint)) { - String attrValue = attributeResolver.getAttributeValue(prop, codepoint); + for (int CodePoint = lowCodePoint; CodePoint <= highCodePoint; CodePoint++) { + if (!attributeResolver.isUnassignedCodePoint(CodePoint)) { + String attrValue = attributeResolver.getAttributeValue(prop, CodePoint); int currentCount = (counters.get(attrValue) == null) ? 
0 : counters.get(attrValue); currentCount++; @@ -714,7 +732,7 @@ private static AttributesImpl getGroupAttributes( boolean isAttributeIncluded = getIsAttributeIncluded( bestAttrValue, - attributeResolver.isUnihanAttributeRange(lowCodepoint), + attributeResolver.isUnihanAttributeRange(lowCodePoint), propDetail, prop, outputRange); @@ -735,7 +753,7 @@ private static AttributesImpl getGroupAttributes( private static boolean getIsAttributeIncluded( String attrValue, boolean isUnihanAttributeRange, - UcdPropertyDetail propDetail, + UCDPropertyDetail propDetail, UcdProperty prop, UCDXMLOUTPUTRANGE outputRange) { if (attrValue == null) { @@ -786,7 +804,7 @@ private static AttributesImpl getReservedAttributes( "CDATA", attributeResolver.getHexString(range.get(range.size() - 1))); } - for (UcdPropertyDetail propDetail : UcdPropertyDetail.baseValues()) { + for (UCDPropertyDetail propDetail : UCDPropertyDetail.baseValues()) { UcdProperty prop = propDetail.getUcdProperty(); if (version.compareTo(propDetail.getMinVersion()) >= 0 && (propDetail.getMaxVersion() == null diff --git a/unicodetools/src/main/java/org/unicode/xml/UCDXMLWriter.java b/unicodetools/src/main/java/org/unicode/xml/UCDXMLWriter.java index ff31e69c61..178d194e34 100644 --- a/unicodetools/src/main/java/org/unicode/xml/UCDXMLWriter.java +++ b/unicodetools/src/main/java/org/unicode/xml/UCDXMLWriter.java @@ -1,6 +1,8 @@ package org.unicode.xml; import java.io.FileOutputStream; +import java.text.SimpleDateFormat; +import java.util.Date; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerConfigurationException; @@ -11,6 +13,9 @@ import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; +/** + * Helper class for writing the contents for the UCDXML files. 
+ */ public class UCDXMLWriter { public static final String NAMESPACE = "http://www.unicode.org/ns/2003/ucd/1.0"; @@ -36,11 +41,11 @@ public UCDXMLWriter(FileOutputStream f) throws TransformerConfigurationException } public void startFile() throws SAXException { + String copyrightYear = new SimpleDateFormat("yyyy").format(new Date()); transformerHandler.startDocument(); char[] c = "\n".toCharArray(); transformerHandler.characters(c, 0, c.length); - // TODO: JRW change hardcoded 2023 to current year. - c = " \u00A9 2023 Unicode\u00AE, Inc. ".toCharArray(); + c = (" \u00A9 " + copyrightYear + " Unicode\u00AE, Inc. ").toCharArray(); transformerHandler.comment(c, 0, c.length); c = "\n".toCharArray(); transformerHandler.characters(c, 0, c.length); diff --git a/unicodetools/src/main/java/org/unicode/xml/XMLProperties.java b/unicodetools/src/main/java/org/unicode/xml/XMLProperties.java index 396bddeb7f..331aa8c624 100644 --- a/unicodetools/src/main/java/org/unicode/xml/XMLProperties.java +++ b/unicodetools/src/main/java/org/unicode/xml/XMLProperties.java @@ -4,14 +4,24 @@ import java.io.File; import java.io.FileInputStream; import java.io.IOException; -import java.util.*; -import java.util.Map.Entry; +import java.util.ArrayList; +import java.util.EnumMap; +import java.util.HashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + import org.unicode.cldr.util.XMLFileReader; import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.UcdProperty; import org.unicode.text.utility.Utility; import org.xml.sax.*; +/** + * Helper class for org.unicode.xml.CompareUCDXML. + * Facilitates traversal of the contents of a UCDXML file. 
+ */ public class XMLProperties { enum XmlLeaf { @@ -187,7 +197,7 @@ public void startElement( case SURROGATE: case NONCHARACTER: parseCp(attributes); - for (final Entry entry : attributes.entrySet()) { + for (final Map.Entry entry : attributes.entrySet()) { doAttributes(entry.getKey(), entry.getValue()); } if (xmlLeaf == XmlLeaf.NONCHARACTER) { diff --git a/unicodetools/src/main/resources/org/unicode/uax42/index.xml b/unicodetools/src/main/resources/org/unicode/uax42/index.xml index 6b4733a2b0..c0f05f5c2c 100644 --- a/unicodetools/src/main/resources/org/unicode/uax42/index.xml +++ b/unicodetools/src/main/resources/org/unicode/uax42/index.xml @@ -12,7 +12,7 @@ stage='proposed-update' schema='rnc' prevrev='34'/> - 2024 + 2025 @@ -21,7 +21,7 @@ - + New value for the age attribute: 16.0. From cb8200425060b9e68dd37135cb760d6b389ee168 Mon Sep 17 00:00:00 2001 From: John Wilcock Date: Thu, 6 Feb 2025 12:51:31 -0800 Subject: [PATCH 03/10] Merged changes manually from ucdxml --- .gitignore | 1 + docs/ucdxml.md | 22 + .../java/org/unicode/props/UcdProperty.java | 4 + .../org/unicode/props/UcdPropertyValues.java | 4 + .../org/unicode/xml/AttributeResolver.java | 336 ++ .../java/org/unicode/xml/CompareUcdXML.java | 197 + .../unicode/xml/GeneratePropertyValues.java | 1749 +++++++++ .../java/org/unicode/xml/UCDDataResolver.java | 210 + .../java/org/unicode/xml/UCDXMLWriter.java | 74 + .../org/unicode/xml/UcdPropertyDetail.java | 2356 +++++++++++ .../org/unicode/xml/UcdSectionComponent.java | 28 + .../org/unicode/xml/UcdSectionDetail.java | 224 ++ .../src/main/java/org/unicode/xml/UcdXML.java | 825 ++++ .../java/org/unicode/xml/XMLProperties.java | 482 +++ .../unicode/props/ExtraPropertyAliases.txt | 6 +- .../org/unicode/props/IndexPropertyRegex.txt | 51 +- .../unicode/props/IndexUnicodeProperties.txt | 22 +- .../org/unicode/uax42/fragments/Bidi_C.xml | 5 + .../org/unicode/uax42/fragments/Bidi_M.xml | 5 + .../org/unicode/uax42/fragments/Emoji.xml | 20 + 
.../org/unicode/uax42/fragments/InCB.xml | 9 + .../org/unicode/uax42/fragments/InPC.xml | 21 + .../org/unicode/uax42/fragments/InSC.xml | 42 + .../org/unicode/uax42/fragments/JSN.xml | 5 + .../org/unicode/uax42/fragments/Join_C.xml | 5 + .../unicode/uax42/fragments/Name_Alias.xml | 10 + .../org/unicode/uax42/fragments/Nushu.xml | 8 + .../uax42/fragments/Set_of_code_points.xml | 8 + .../org/unicode/uax42/fragments/Tangut.xml | 18 + .../org/unicode/uax42/fragments/Unihan.xml | 347 ++ .../org/unicode/uax42/fragments/age.xml | 23 + .../org/unicode/uax42/fragments/bc.xml | 17 + .../org/unicode/uax42/fragments/blk.xml | 344 ++ .../org/unicode/uax42/fragments/block.xml | 10 + .../org/unicode/uax42/fragments/bmg.xml | 5 + .../org/unicode/uax42/fragments/boolean.xml | 4 + .../unicode/uax42/fragments/boundaries.xml | 58 + .../org/unicode/uax42/fragments/bpb.xml | 5 + .../org/unicode/uax42/fragments/bpt.xml | 5 + .../unicode/uax42/fragments/case_folding.xml | 8 + .../unicode/uax42/fragments/case_mapping.xml | 11 + .../unicode/uax42/fragments/case_other.xml | 32 + .../org/unicode/uax42/fragments/casing.xml | 14 + .../org/unicode/uax42/fragments/ccc.xml | 5 + .../unicode/uax42/fragments/cjk-radicals.xml | 10 + .../org/unicode/uax42/fragments/cjkEACC.xml | 5 + .../uax42/fragments/cjkIRG_TSource.xml | 6 + .../unicode/uax42/fragments/composition.xml | 8 + .../org/unicode/uax42/fragments/datatypes.xml | 5 + .../uax42/fragments/datatypes_code_points.xml | 9 + .../unicode/uax42/fragments/decomposition.xml | 11 + .../unicode/uax42/fragments/description.xml | 6 + .../unicode/uax42/fragments/do-not-emit.xml | 22 + .../org/unicode/uax42/fragments/ea.xml | 5 + .../unicode/uax42/fragments/emoji-sources.xml | 11 + .../uax42/fragments/function_graphic.xml | 68 + .../org/unicode/uax42/fragments/gc.xml | 12 + .../org/unicode/uax42/fragments/groups.xml | 8 + .../org/unicode/uax42/fragments/hst.xml | 5 + .../unicode/uax42/fragments/identifier.xml | 26 + .../unicode/uax42/fragments/ideographs.xml 
| 23 + .../org/unicode/uax42/fragments/isc.xml | 5 + .../uax42/fragments/jis-code-point.xml | 5 + .../org/unicode/uax42/fragments/joining.xml | 53 + .../org/unicode/uax42/fragments/lb.xml | 24 + .../unicode/uax42/fragments/miscellaneous.xml | 11 + .../org/unicode/uax42/fragments/na.xml | 13 + .../org/unicode/uax42/fragments/na1.xml | 5 + .../uax42/fragments/named-sequences.xml | 15 + .../org/unicode/uax42/fragments/namespace.xml | 5 + .../fragments/normalization-corrections.xml | 11 + .../org/unicode/uax42/fragments/numeric.xml | 8 + .../org/unicode/uax42/fragments/pattern.xml | 8 + .../unicode/uax42/fragments/quickcheck.xml | 31 + .../unicode/uax42/fragments/repertoire.xml | 6 + .../fragments/repertoire_Code_points.xml | 23 + .../org/unicode/uax42/fragments/script.xml | 49 + .../uax42/fragments/simple_case_mapping.xml | 11 + .../uax42/fragments/standardized-variants.xml | 10 + .../org/unicode/uax42/fragments/start.xml | 6 + .../resources/org/unicode/uax42/index.xml | 1353 +++++++ .../org/unicode/uax42/index2html.xsl | 611 +++ .../resources/org/unicode/uax42/index2rnc.xsl | 45 + .../org/unicode/uax42/output/index.html | 3482 +++++++++++++++++ .../org/unicode/uax42/output/index.rnc | 1455 +++++++ .../main/resources/org/unicode/uax42/pom.xml | 72 + 86 files changed, 15162 insertions(+), 30 deletions(-) create mode 100644 docs/ucdxml.md create mode 100644 unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java create mode 100644 unicodetools/src/main/java/org/unicode/xml/CompareUcdXML.java create mode 100644 unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java create mode 100644 unicodetools/src/main/java/org/unicode/xml/UCDDataResolver.java create mode 100644 unicodetools/src/main/java/org/unicode/xml/UCDXMLWriter.java create mode 100644 unicodetools/src/main/java/org/unicode/xml/UcdPropertyDetail.java create mode 100644 unicodetools/src/main/java/org/unicode/xml/UcdSectionComponent.java create mode 100644 
unicodetools/src/main/java/org/unicode/xml/UcdSectionDetail.java create mode 100644 unicodetools/src/main/java/org/unicode/xml/UcdXML.java create mode 100644 unicodetools/src/main/java/org/unicode/xml/XMLProperties.java create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/Bidi_C.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/Bidi_M.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/Emoji.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/InCB.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/InPC.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/InSC.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/JSN.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/Join_C.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/Name_Alias.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/Nushu.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/Set_of_code_points.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/Tangut.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/Unihan.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/age.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/bc.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/blk.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/block.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/bmg.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/boolean.xml create mode 100644 
unicodetools/src/main/resources/org/unicode/uax42/fragments/boundaries.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/bpb.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/bpt.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/case_folding.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/case_mapping.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/case_other.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/casing.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/ccc.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/cjk-radicals.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/cjkEACC.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/cjkIRG_TSource.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/composition.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/datatypes.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/datatypes_code_points.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/decomposition.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/description.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/do-not-emit.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/ea.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/emoji-sources.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/function_graphic.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/gc.xml create mode 100644 
unicodetools/src/main/resources/org/unicode/uax42/fragments/groups.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/hst.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/identifier.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/ideographs.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/isc.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/jis-code-point.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/joining.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/lb.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/miscellaneous.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/na.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/na1.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/named-sequences.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/namespace.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/normalization-corrections.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/numeric.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/pattern.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/quickcheck.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/repertoire.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/repertoire_Code_points.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/script.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/simple_case_mapping.xml create mode 100644 
unicodetools/src/main/resources/org/unicode/uax42/fragments/standardized-variants.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/fragments/start.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/index.xml create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/index2html.xsl create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/index2rnc.xsl create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/output/index.html create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/output/index.rnc create mode 100644 unicodetools/src/main/resources/org/unicode/uax42/pom.xml diff --git a/.gitignore b/.gitignore index 60e7ec63ef..c6d5a34bd2 100644 --- a/.gitignore +++ b/.gitignore @@ -43,6 +43,7 @@ perf-*.xml test-*.xml # Directories +.idea/ .settings/ .vs/ .vscode/ diff --git a/docs/ucdxml.md b/docs/ucdxml.md new file mode 100644 index 0000000000..207842db2a --- /dev/null +++ b/docs/ucdxml.md @@ -0,0 +1,22 @@ +# Generating TR42 + +## Step 1 - Generate property value fragments + +- mvn compile exec:java '-Dexec.mainClass="org.unicode.xml.GeneratePropertyValues"' '-Dexec.args="--ucdversion 16.0.0 -f $(cd ./unicodetools/src/main/resources/org/unicode/uax42/fragments; pwd)"' -DCLDR_DIR=$(cd ../cldr ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated ; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) + +## Step 2 - Generate TR42 index.html and index.rnc + +- mvn xml:transform -f $(cd ./unicodetools/src/main/resources/org/unicode/uax42/fragments; pwd) -Doutputdir=../Generated/uax42/ + +## Step 3 - Validate generated UAX XML files + +You'll need a [RELAX NG](https://relaxng.org/) schema validator. We'll use [jing-trang](https://github. +com/relaxng/jing-trang) in this example. + +1. Clone and build [jing-trang](https://github.com/relaxng/jing-trang) +2. 
Run the following: + ``` + java -jar C:\_git\jing-trang\build\jing.jar -c UNICODETOOLS_REPO_DIR\uax\uax42\output\index.rnc + ``` + Note that the UAX xml file has to be saved as NFD as the Unihan syntax regular expressions are expecting NFD. + diff --git a/unicodetools/src/main/java/org/unicode/props/UcdProperty.java b/unicodetools/src/main/java/org/unicode/props/UcdProperty.java index 914168c903..60efa304db 100644 --- a/unicodetools/src/main/java/org/unicode/props/UcdProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UcdProperty.java @@ -85,6 +85,9 @@ public enum UcdProperty { Emoji_SB(PropertyType.Miscellaneous, "ESB"), ISO_Comment(PropertyType.Miscellaneous, "isc"), Jamo_Short_Name(PropertyType.Miscellaneous, "JSN"), + NC_Corrected(PropertyType.Miscellaneous, "ncCorrected"), + NC_Original(PropertyType.Miscellaneous, "ncOriginal"), + NC_Version(PropertyType.Miscellaneous, "ncVersion"), Name(PropertyType.Miscellaneous, "na"), Name_Alias(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "Name_Alias"), Named_Sequences(PropertyType.Miscellaneous, "NS"), @@ -100,6 +103,7 @@ public enum UcdProperty { null, ValueCardinality.Unordered, "cjkAlternateTotalStrokes"), + emoji_variation_sequence(PropertyType.Miscellaneous, "EVS"), kBigFive(PropertyType.Miscellaneous, "cjkBigFive"), kCCCII(PropertyType.Miscellaneous, "cjkCCCII"), kCNS1986(PropertyType.Miscellaneous, "cjkCNS1986"), diff --git a/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java b/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java index f8bb22f1a4..0aac98c263 100644 --- a/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java +++ b/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java @@ -766,6 +766,7 @@ public static East_Asian_Width_Values forName(String name) { // Emoji_DCM // Emoji_KDDI // Emoji_SB + // emoji_variation_sequence // Equivalent_Unified_Ideograph // FC_NFKC_Closure public enum General_Category_Values implements 
Named { @@ -1668,6 +1669,9 @@ public static Line_Break_Values forName(String name) { // Name_Alias // Named_Sequences // Named_Sequences_Prov + // NC_Corrected + // NC_Original + // NC_Version public enum NFC_Quick_Check_Values implements Named { Maybe("M"), No("N"), diff --git a/unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java b/unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java new file mode 100644 index 0000000000..ccb4984ec7 --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java @@ -0,0 +1,336 @@ +package org.unicode.xml; + +import com.ibm.icu.impl.UnicodeMap; +import com.ibm.icu.util.VersionInfo; +import java.util.*; +import org.unicode.cldr.draft.FileUtilities; +import org.unicode.props.*; + +public class AttributeResolver { + + private final IndexUnicodeProperties indexUnicodeProperties; + private final UnicodeMap map_age; + private final UnicodeMap map_block; + private final UnicodeMap map_decomposition_type; + private final UnicodeMap map_general_category; + private final UnicodeMap map_script; + private final UnicodeMap map_script_extensions; + private final HashMap> map_NameAlias; + + // If there is a change in any of these properties between two adjacent characters, it will + // result in a new range. 
+ private final UcdPropertyDetail[] rangeDefiningPropertyDetails = { + UcdPropertyDetail.Age_Detail, + UcdPropertyDetail.Bidi_Class_Detail, + UcdPropertyDetail.Block_Detail, + UcdPropertyDetail.Decomposition_Mapping_Detail, + UcdPropertyDetail.Numeric_Type_Detail, + UcdPropertyDetail.Numeric_Value_Detail, + UcdPropertyDetail.Vertical_Orientation_Detail + }; + + public AttributeResolver(IndexUnicodeProperties iup) { + indexUnicodeProperties = iup; + map_age = indexUnicodeProperties.loadEnum(UcdProperty.Age); + map_block = indexUnicodeProperties.loadEnum(UcdProperty.Block); + map_decomposition_type = indexUnicodeProperties.loadEnum(UcdProperty.Decomposition_Type); + map_general_category = indexUnicodeProperties.loadEnum(UcdProperty.General_Category); + map_script = indexUnicodeProperties.loadEnum(UcdProperty.Script); + map_script_extensions = + indexUnicodeProperties.getProperty(UcdProperty.Script_Extensions).getUnicodeMap(); + + // UCD code is only set up to read a single Alias value from NameAliases.txt + // Instead, we'll load the Alias and the Type data as part of the constructor. We'll keep in + // memory as it + // NameAliases isn't too large. 
+ map_NameAlias = loadNameAliases(); + } + + protected enum AliasType { + ABBREVIATION("abbreviation"), + ALTERNATE("alternate"), + CONTROL("control"), + CORRECTION("correction"), + FIGMENT("figment"), + NONE("none"); + + private final String aliasType; + + AliasType(String aliasType) { + this.aliasType = aliasType; + } + + public String toString() { + return aliasType; + } + } + + private static class NameAlias { + + private String alias; + private final AliasType type; + + private NameAlias(String alias, AliasType type) { + this.alias = alias; + this.type = type; + } + + public String getAlias() { + return alias; + } + + public AliasType getType() { + return type; + } + } + + private static class NameAliasComparator implements java.util.Comparator { + + @Override + public int compare(NameAlias o1, NameAlias o2) { + return o1.getAlias().compareTo(o2.getAlias()); + } + } + + private HashMap> loadNameAliases() { + HashMap> nameAliasesByCodepoint = new HashMap<>(); + final PropertyParsingInfo fileInfo = + PropertyParsingInfo.getPropertyInfo(UcdProperty.Name_Alias); + String fullFilename = fileInfo.getFullFileName(indexUnicodeProperties.getUcdVersion()); + UcdLineParser parser = new UcdLineParser(FileUtilities.in("", fullFilename)); + NameAliasComparator nameAliasComparator = new NameAliasComparator(); + + for (UcdLineParser.UcdLine line : parser) { + String[] parts = line.getParts(); + int codepoint = Integer.parseInt(parts[0], 16); + NameAlias nameAlias; + if (parts.length < 3) { + nameAlias = new NameAlias(parts[1], AliasType.NONE); + } else { + nameAlias = + new NameAlias( + parts[1], AliasType.valueOf(parts[2].toUpperCase(Locale.ROOT))); + } + + if (nameAliasesByCodepoint.containsKey(codepoint)) { + LinkedList nameAliases = + new LinkedList<>(nameAliasesByCodepoint.get(codepoint)); + nameAliases.add(nameAlias); + nameAliases.sort(nameAliasComparator); + nameAliasesByCodepoint.replace(codepoint, nameAliases); + } else { + nameAliasesByCodepoint.put(codepoint, new 
LinkedList<>(List.of(nameAlias))); + } + } + return nameAliasesByCodepoint; + } + + public String getAttributeValue(UcdProperty prop, int codepoint) { + String resolvedValue = indexUnicodeProperties.getResolvedValue(prop, codepoint); + switch (prop.getType()) { + case Numeric: + switch (prop) { + case kOtherNumeric: + case kPrimaryNumeric: + case kAccountingNumeric: + return (resolvedValue.equals("NaN")) ? null : resolvedValue; + default: + return Optional.ofNullable(resolvedValue).orElse("NaN"); + } + case String: + switch (prop) { + case Equivalent_Unified_Ideograph: + String EqUIdeo = getMappingValue(codepoint, resolvedValue, false, ""); + return (EqUIdeo.equals("#")) ? null : EqUIdeo; + case kCompatibilityVariant: + String kCompatibilityVariant = + getMappingValue(codepoint, resolvedValue, false, "U+"); + return (kCompatibilityVariant.equals("#")) ? "" : kCompatibilityVariant; + case kSimplifiedVariant: + case kTraditionalVariant: + String kVariant = + getMappingValue( + codepoint, + resolvedValue, + isUnihanAttributeRange(codepoint), + "U+"); + return (kVariant.equals("#")) ? "" : kVariant; + case Bidi_Mirroring_Glyph: + // Returning empty string for bmg to maintain compatibility with older + // generated files. + String bmg = getMappingValue(codepoint, resolvedValue, false, ""); + return (bmg.equals("#")) ? 
"" : bmg; + default: + return getMappingValue(codepoint, resolvedValue, false, ""); + } + case Miscellaneous: + switch (prop) { + case Jamo_Short_Name: + // return map_jamo_short_name.get(codepoint).getShortName(); + return Optional.ofNullable(resolvedValue).orElse(""); + case Name: + if (resolvedValue != null + && resolvedValue.startsWith("CJK UNIFIED IDEOGRAPH-")) { + return "CJK UNIFIED IDEOGRAPH-#"; + } + if (resolvedValue != null + && resolvedValue.startsWith("CJK COMPATIBILITY IDEOGRAPH-")) { + return "CJK COMPATIBILITY IDEOGRAPH-#"; + } + if (resolvedValue != null + && resolvedValue.startsWith("TANGUT IDEOGRAPH-")) { + return "TANGUT IDEOGRAPH-#"; + } + if (resolvedValue != null + && resolvedValue.startsWith("KHITAN SMALL SCRIPT CHARACTER-")) { + return "KHITAN SMALL SCRIPT CHARACTER-#"; + } + if (resolvedValue != null && resolvedValue.startsWith("NUSHU CHARACTER-")) { + return "NUSHU CHARACTER-#"; + } + if (resolvedValue != null + && resolvedValue.startsWith("EGYPTIAN HIEROGLYPH-")) { + return "EGYPTIAN HIEROGLYPH-#"; + } + return Optional.ofNullable(resolvedValue).orElse(""); + case kDefinition: + return resolvedValue; + default: + if (resolvedValue != null) { + return resolvedValue.replaceAll("\\|", " "); + } + return ""; + } + case Catalog: + switch (prop) { + case Age: + String age = map_age.get(codepoint).getShortName(); + return (age.equals("NA")) ? 
"unassigned" : age; + case Block: + return map_block.get(codepoint).getShortName(); + case Script: + return map_script.get(codepoint).getShortName(); + case Script_Extensions: + StringBuilder extensionBuilder = new StringBuilder(); + String[] extensions = map_script_extensions.get(codepoint).split("\\|", 0); + for (String extension : extensions) { + extensionBuilder.append( + UcdPropertyValues.Script_Values.valueOf(extension) + .getShortName()); + extensionBuilder.append(" "); + } + return extensionBuilder.toString().trim(); + default: + throw new RuntimeException("Missing Catalog case"); + } + case Enumerated: + switch (prop) { + case Decomposition_Type: + // Returning lower case to maintain compatibility with older generated + // files. + return map_decomposition_type + .get(codepoint) + .getShortName() + .toLowerCase(Locale.ROOT); + default: + final UnicodeProperty property = indexUnicodeProperties.getProperty(prop); + final List valueAliases = property.getValueAliases(property.getValue(codepoint)); + return valueAliases.get(0); + } + case Binary: + { + switch (resolvedValue) { + // Seems overkill to get this from UcdPropertyValues.Binary + case "No": + return "N"; + case "Yes": + return "Y"; + default: + throw new RuntimeException("Unexpected Binary value"); + } + } + default: + throw new RuntimeException("Missing PropertyType case"); + } + } + + public boolean isUnassignedCodepoint(int codepoint) { + return UcdPropertyValues.General_Category_Values.Unassigned.equals(getgc(codepoint)) + || UcdPropertyValues.General_Category_Values.Private_Use.equals(getgc(codepoint)) + || UcdPropertyValues.General_Category_Values.Surrogate.equals(getgc(codepoint)); + } + + public UcdPropertyValues.General_Category_Values getgc(int codepoint) { + return map_general_category.get(codepoint); + } + + public String getNChar(int codepoint) { + return getAttributeValue(UcdProperty.Noncharacter_Code_Point, codepoint); + } + + public HashMap getNameAliases(int codepoint) { + HashMap 
nameAliases = new LinkedHashMap<>(); + LinkedList nameAliasList = map_NameAlias.get(codepoint); + if (null != nameAliasList && !nameAliasList.isEmpty()) { + for (NameAlias nameAlias : nameAliasList) { + nameAliases.put(nameAlias.getAlias(), nameAlias.getType().toString()); + } + return nameAliases; + } + return null; + } + + private String getMappingValue( + int codepoint, String resolvedValue, boolean ignoreUnihanRange, String prefix) { + if (null == resolvedValue) { + return "#"; + } + int[] resolvedValueInts = resolvedValue.codePoints().toArray(); + if (resolvedValueInts.length == 1 + && resolvedValueInts[0] == codepoint + && !ignoreUnihanRange) { + return "#"; + } + StringBuilder sb = new StringBuilder(); + for (int i : resolvedValueInts) { + sb.append(prefix).append(getCPString(i)).append(" "); + } + return sb.toString().trim(); + } + + /** + * Reports whether two code points differ in at least one range-defining property that is in + * effect for the given UCD version. + * + * A property is in effect when ucdVersion is at or after its minimum version and, if a maximum + * version is set, strictly before that maximum. + * + * @param ucdVersion UCD version used to select the applicable properties + * @param codepointA first code point to compare + * @param codepointB second code point to compare + * @return true as soon as one applicable property has different attribute values for the two + * code points, false if none differs + */ + public boolean isDifferentRange(VersionInfo ucdVersion, int codepointA, int codepointB) { + for (UcdPropertyDetail propDetail : rangeDefiningPropertyDetails) { + UcdProperty prop = propDetail.getUcdProperty(); + if (ucdVersion.compareTo(propDetail.getMinVersion()) >= 0 + && (propDetail.getMaxVersion() == null + || ucdVersion.compareTo(propDetail.getMaxVersion()) < 0)) { + String valueA = getAttributeValue(prop, codepointA); + String valueB = getAttributeValue(prop, codepointB); + /* getAttributeValue can return null, so compare null-safely instead of calling equals on the first value. */ + if (valueA == null ? valueB != null : !valueA.equals(valueB)) { + return true; + } + } + } + return false; + } + + private static String getCPString(int codepoint) { + return String.format("%4s", Integer.toHexString(codepoint)) + .replace(" ", "0") + .toUpperCase(Locale.ROOT); + } + + public String getHexString(int codepoint) { + return getCPString(codepoint); + } + + public boolean isUnihanAttributeRange(int codepoint) { + return getAttributeValue(UcdProperty.Unified_Ideograph, codepoint).equals("Y") + || !getAttributeValue(UcdProperty.kCompatibilityVariant, codepoint).isEmpty(); + } + + public boolean isUnifiedIdeograph(int codepoint) { + return
getAttributeValue(UcdProperty.Unified_Ideograph, codepoint).equals("Y") + && getAttributeValue(UcdProperty.Name, codepoint).equals("CJK UNIFIED IDEOGRAPH-#"); + } +} diff --git a/unicodetools/src/main/java/org/unicode/xml/CompareUcdXML.java b/unicodetools/src/main/java/org/unicode/xml/CompareUcdXML.java new file mode 100644 index 0000000000..52120eda45 --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/CompareUcdXML.java @@ -0,0 +1,197 @@ +package org.unicode.xml; + +import com.ibm.icu.dev.tool.UOption; +import com.ibm.icu.impl.UnicodeMap; +import com.ibm.icu.text.UnicodeSet; +import java.io.*; +import java.util.HashMap; +import java.util.Objects; +import org.unicode.props.UcdProperty; + +public class CompareUcdXML { + + private static final String NEWLINE = System.getProperty("line.separator"); + private static final UOption[] options = { + UOption.HELP_H(), + UOption.create("fileA", 'a', UOption.REQUIRES_ARG), + UOption.create("fileB", 'b', UOption.REQUIRES_ARG) + }; + + private static final UcdProperty[] codepointSequenceProperties = + new UcdProperty[] { + UcdProperty.Named_Sequences, + UcdProperty.Named_Sequences_Prov, + UcdProperty.Standardized_Variant, + UcdProperty.Emoji_DCM, + UcdProperty.Emoji_KDDI, + UcdProperty.Emoji_SB, + UcdProperty.Do_Not_Emit_Preferred + }; + + private static final HashMap knownDifferences; + + static { + knownDifferences = new HashMap<>(); + + // https://github.com/unicode-org/properties/issues/296 + knownDifferences.put(0x31E4, new String[] {"Hani", "Zyyy"}); + knownDifferences.put(0x31E5, new String[] {"Hani", "Zyyy"}); + + // https://github.com/unicode-org/unicodetools/issues/325 + knownDifferences.put(0x109F7, new String[] {"1/6", "2/12"}); + knownDifferences.put(0x109F8, new String[] {"1/4", "3/12"}); + knownDifferences.put(0x109F9, new String[] {"1/3", "4/12"}); + knownDifferences.put(0x109FB, new String[] {"1/2", "6/12"}); + knownDifferences.put(0x109FD, new String[] {"2/3", "8/12"}); + 
knownDifferences.put(0x109FE, new String[] {"3/4", "9/12"}); + knownDifferences.put(0x109FF, new String[] {"5/6", "10/12"}); + + // https://github.com/unicode-org/properties/issues/172 + knownDifferences.put(0x5146, new String[] {"1000000", "1000000 1000000000000"}); + knownDifferences.put(0x79ED, new String[] {"1000000000", "1000000000 1000000000000"}); + } + + private static final int HELP = 0, FILE_A = 1, FILE_B = 2, LOGFILE = 3; + + public static void main(String[] args) throws Exception { + File fileA = null; + File fileB = null; + int errorCount = 0; + + UOption.parseArgs(args, options); + + if (options[HELP].doesOccur) { + System.out.println("CompareUcdXML --fileA {file path} --fileB {file path}"); + System.exit(0); + } + + if (options[FILE_A].doesOccur) { + try { + fileA = new File(options[FILE_A].value); + if (!fileA.exists()) { + throw new IOException(); + } + } catch (Exception e) { + throw new IllegalArgumentException("Could not find " + options[FILE_A].value); + } + } else { + throw new IllegalArgumentException("Missing command line option: --fileA (or -a)"); + } + + if (options[FILE_B].doesOccur) { + try { + fileB = new File(options[FILE_B].value); + if (!fileB.exists()) { + throw new IOException(); + } + } catch (Exception e) { + throw new IllegalArgumentException("Could not find " + options[FILE_B].value); + } + } else { + throw new IllegalArgumentException("Missing command line option: --fileB (or -b)"); + } + + System.out.println("Comparing " + fileA + " and " + fileB); + + final XMLProperties xmlPropsA = new XMLProperties(fileA); + final XMLProperties xmlPropsB = new XMLProperties(fileB); + + // First, iterate through the UcdProperties on each codepoint. 
+ for (final UcdProperty prop : UcdProperty.values()) { + UnicodeMap fileAMap = xmlPropsA.getMap(prop); + UnicodeMap fileBMap = xmlPropsB.getMap(prop); + if (!fileAMap.equals(fileBMap)) { + for (int i = 0; i <= 0x10ffff; ++i) { + try { + String xmlValA = fileAMap.get(i); + String xmlValB = fileBMap.get(i); + if (!Objects.equals(xmlValA, xmlValB)) { + // At least one string is != null and the strings are different, but we + // don't care if one + // is null and one is empty_string + // As far as we care, empty_string == null == "00000" + int lenA = + (xmlValA == null + ? 0 + : (xmlValA.equals("00000") ? 0 : xmlValA.length())); + int lenB = + (xmlValB == null + ? 0 + : (xmlValB.equals("00000") ? 0 : xmlValB.length())); + if (!(lenA == 0 && lenB == 0) + && !isKnownDifference(i, xmlValA, xmlValB)) { + errorCount++; + System.out.println( + "For UCDProperty " + + prop.name() + + " (" + + prop.getShortName() + + ") [" + + String.format("0x%04X", i) + + "], "); + System.out.println("\t" + fileA + " = " + xmlValA); + System.out.println("\t" + fileB + " = " + xmlValB); + } + } + } catch (Exception e) { + System.out.println("Exception thrown for " + String.format("0x%04X", i)); + System.out.println(e.getMessage()); + } + } + } + } + // Now handle anything that contains codepoint sequences. + for (UcdProperty prop : codepointSequenceProperties) { + UnicodeMap fileAMap = xmlPropsA.getMap(prop); + UnicodeMap fileBMap = xmlPropsB.getMap(prop); + UnicodeSet differences = fileAMap.keySet().addAll(fileBMap.keySet()); + for (String key : differences) { + try { + String xmlValA = fileAMap.get(key); + String xmlValB = fileBMap.get(key); + if (!Objects.equals(xmlValA, xmlValB)) { + // At least one string is != null and the strings are different, but we + // don't care if one + // is null and one is empty_string + // As far as we care, empty_string == null == "00000" + int lenA = + (xmlValA == null + ? 0 + : (xmlValA.equals("00000") ? 
0 : xmlValA.length())); + int lenB = + (xmlValB == null + ? 0 + : (xmlValB.equals("00000") ? 0 : xmlValB.length())); + if (!(lenA == 0 && lenB == 0)) { + errorCount++; + System.out.println( + "For UCDProperty " + + prop.name() + + " (" + + prop.getShortName() + + ") [" + + key + + "], "); + System.out.println("\t" + fileA + " = " + xmlValA); + System.out.println("\t" + fileB + " = " + xmlValB); + } + } + } catch (Exception e) { + /* key is a String here, so it cannot be formatted with %X; print it as-is. */ + System.out.println("Exception thrown for " + key); + System.out.println(e.getMessage()); + } + } + } + /* Exit statuses are truncated to 8 bits on POSIX systems; clamp so that a multiple of 256 differences is never reported as success. */ + System.exit(Math.min(errorCount, 255)); + } + + /** + * Checks whether a mismatch at a code point is one of the documented known differences. + * + * @param codepoint code point at which the two files disagree + * @param xmlValA value read from the first file + * @param xmlValB value read from the second file + * @return true if knownDifferences has an entry for the code point whose two values match + * xmlValA and xmlValB in either order; false otherwise + */ + private static boolean isKnownDifference(int codepoint, String xmlValA, String xmlValB) { + if (knownDifferences.containsKey(codepoint)) { + String knownValue1 = knownDifferences.get(codepoint)[0]; + String knownValue2 = knownDifferences.get(codepoint)[1]; + return (knownValue1.equals(xmlValA) && knownValue2.equals(xmlValB)) + || (knownValue1.equals(xmlValB) && knownValue2.equals(xmlValA)); + } + return false; + } +} diff --git a/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java b/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java new file mode 100644 index 0000000000..f8a0dfa279 --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java @@ -0,0 +1,1749 @@ +package org.unicode.xml; + +import com.ibm.icu.dev.tool.UOption; +import com.ibm.icu.util.VersionInfo; +import java.io.*; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.unicode.props.PropertyParsingInfo; +import org.unicode.props.UcdProperty; +import org.unicode.props.UcdPropertyValues.*; + +public class GeneratePropertyValues { + + private enum VALUESOUTPUTTYPE { + VALUE_PER_LINE, + ALPHABETICAL_GROUP, + NUMERICAL_GROUP, + MAX_LINE_LENGTH; + } + + private enum SCHEMA { + // Manual indicates a fragment
file that is maintained manually rather than generated from + // this utility. + // Manual + NAMESPACE("namespace"), + // Manual + DATATYPES("datatypes"), + // Manual + START("start"), + BOOLEAN("boolean"), + // Manual + DESCRIPTION("description"), + // Manual + REPERTOIRE("repertoire"), + PROPERTIES("properties"), + TANGUT("tangut"), + NUSHU("nushu"), + EMOJI_DATA("emoji-data"), + // Manual + BLOCK("block"), + // Manual + NAMED_SEQUENCES("named-sequences"), + // Manual + NORMALIZATION_CORRECTIONS("normalization-corrections"), + // Manual + STANDARDIZED_VARIANTS("standardized-variants"), + // Manual + CJK_RADICALS("cjk-radicals"), + // Manual + EMOJI_SOURCES("emoji-sources"), + DO_NOT_EMIT("do-not-emit"); + + final String name; + + SCHEMA(String name) { + this.name = name; + } + + String getName() { + return this.name; + } + } + + private static final class TR38Details { + boolean isList; + String syntax; + + public TR38Details(boolean isList, String syntax) { + this.isList = isList; + this.syntax = syntax; + } + + public boolean isList() { + return isList; + } + + public String getSyntax() { + return syntax; + } + } + + private static final int MAX_LINE_LENGTH = 70; + private static final String NEWLINE = System.lineSeparator(); + private static final String DOUBLELINE = System.lineSeparator() + System.lineSeparator(); + private static final String TRIPLELINE = + System.lineSeparator() + System.lineSeparator() + System.lineSeparator(); + private static File destinationFolder = null; + + private static HashMap syntaxTR38; + private static final String NAMESPACE = "http://unicode.org/ns/2001/ucdxml"; + private static final String TR38URL = "https://www.unicode.org/reports/tr38"; + private static final UOption[] options = { + UOption.HELP_H(), + UOption.create("ucdversion", 'v', UOption.REQUIRES_ARG), + UOption.create("outputfolder", 'f', UOption.REQUIRES_ARG) + }; + + private static final int HELP = 0, UCDVERSION = 1, OUTPUTFOLDER = 2; + + public static void 
main(String[] args) throws Exception { + + VersionInfo ucdVersion = null; + + UOption.parseArgs(args, options); + + if (options[HELP].doesOccur) { + System.out.println( + "GeneratePropertyValuesList --ucdversion {version number} --outputfolder {destination}"); + System.exit(0); + } + + try { + if (options[UCDVERSION].doesOccur) { + try { + ucdVersion = VersionInfo.getInstance(options[UCDVERSION].value); + } catch (Exception e) { + throw new IllegalArgumentException( + "Could not convert " + + options[UCDVERSION].value + + " to a valid UCD version"); + } + } else { + throw new IllegalArgumentException( + "Missing command line option: --ucdversion (or -v)"); + } + if (options[OUTPUTFOLDER].doesOccur) { + try { + destinationFolder = new File(options[OUTPUTFOLDER].value); + if (!destinationFolder.exists()) { + if (!destinationFolder.mkdir()) { + throw new IOException(); + } + } + } catch (Exception e) { + throw new IllegalArgumentException( + "Could not find or create " + options[OUTPUTFOLDER].value); + } + } else { + throw new IllegalArgumentException( + "Missing command line option: --outputfolder (or -f)"); + } + + } catch (Exception e) { + System.err.println(e.getMessage()); + System.exit(1); + } + + if (ucdVersion != null && destinationFolder.exists()) { + buildPropertyValues(ucdVersion); + System.out.println("End"); + System.exit(0); + } else { + System.err.println("Unexpected error when building UcdXML file."); + System.exit(1); + } + } + + private static void buildPropertyValues( + // It would be nice to be able to generate values by ucdVersion. Leaving this here for + // now... 
+ VersionInfo ucdVersion) throws IOException, URISyntaxException { + syntaxTR38 = parseTR38(); + + createPropertyFragment( + SCHEMA.BOOLEAN, + getFormattedValues(SCHEMA.BOOLEAN, VALUESOUTPUTTYPE.MAX_LINE_LENGTH)); + createPropertyFragment( + UcdProperty.Age, + SCHEMA.PROPERTIES, + getFormattedAttribute(UcdProperty.Age, VALUESOUTPUTTYPE.NUMERICAL_GROUP)); + createPropertyFragment( + UcdProperty.Name, SCHEMA.PROPERTIES, getFormattedSyntax(UcdProperty.Name)); + createPropertyFragment( + UcdProperty.Unicode_1_Name, + SCHEMA.PROPERTIES, + getFormattedSyntax(UcdProperty.Unicode_1_Name)); + createPropertyFragment( + UcdProperty.Name_Alias.getShortName() + ".xml", + "name-alias element", + SCHEMA.PROPERTIES, + getFormattedElement(UcdProperty.Name_Alias)); + createPropertyFragment( + UcdProperty.Block, + SCHEMA.PROPERTIES, + getFormattedAttribute(UcdProperty.Block, VALUESOUTPUTTYPE.VALUE_PER_LINE)); + createPropertyFragment( + UcdProperty.General_Category, + SCHEMA.PROPERTIES, + getFormattedAttribute( + UcdProperty.General_Category, VALUESOUTPUTTYPE.ALPHABETICAL_GROUP)); + createPropertyFragment( + UcdProperty.Canonical_Combining_Class, + SCHEMA.PROPERTIES, + getFormattedAttribute( + UcdProperty.Canonical_Combining_Class, VALUESOUTPUTTYPE.VALUE_PER_LINE)); + createPropertyFragment( + UcdProperty.Bidi_Class, + SCHEMA.PROPERTIES, + getFormattedAttribute(UcdProperty.Bidi_Class, VALUESOUTPUTTYPE.ALPHABETICAL_GROUP)); + createPropertyFragment( + UcdProperty.Bidi_Mirrored, + SCHEMA.PROPERTIES, + getFormattedBoolean(UcdProperty.Bidi_Mirrored)); + createPropertyFragment( + UcdProperty.Bidi_Mirroring_Glyph, + SCHEMA.PROPERTIES, + getFormattedSyntax(UcdProperty.Bidi_Mirroring_Glyph)); + createPropertyFragment( + UcdProperty.Bidi_Control, + SCHEMA.PROPERTIES, + getFormattedBoolean(UcdProperty.Bidi_Control)); + createPropertyFragment( + UcdProperty.Bidi_Paired_Bracket_Type, + SCHEMA.PROPERTIES, + getFormattedAttribute( + UcdProperty.Bidi_Paired_Bracket_Type, 
VALUESOUTPUTTYPE.MAX_LINE_LENGTH)); + createPropertyFragment( + UcdProperty.Bidi_Paired_Bracket, + SCHEMA.PROPERTIES, + getFormattedSyntax(UcdProperty.Bidi_Paired_Bracket)); + createPropertyFragment( + "decomposition.xml", + "decomposition properties", + SCHEMA.PROPERTIES, + getFormattedDecompositionProperties()); + createPropertyFragment( + "composition.xml", + "composition properties", + SCHEMA.PROPERTIES, + getFormattedCompositionProperties()); + createPropertyFragment( + "quickcheck.xml", + "quick check properties", + SCHEMA.PROPERTIES, + getFormattedQuickCheckProperties()); + createPropertyFragment( + "numeric.xml", + "numeric properties", + SCHEMA.PROPERTIES, + getFormattedNumericProperties()); + createPropertyFragment( + "joining.xml", + "joining properties", + SCHEMA.PROPERTIES, + getFormattedJoiningProperties()); + createPropertyFragment( + UcdProperty.Join_Control.getShortName() + ".xml", + "joining properties", + SCHEMA.PROPERTIES, + getFormattedBoolean(UcdProperty.Join_Control)); + createPropertyFragment( + UcdProperty.Line_Break, + SCHEMA.PROPERTIES, + getFormattedAttribute(UcdProperty.Line_Break, VALUESOUTPUTTYPE.ALPHABETICAL_GROUP)); + createPropertyFragment( + UcdProperty.East_Asian_Width, + SCHEMA.PROPERTIES, + getFormattedAttribute( + UcdProperty.East_Asian_Width, VALUESOUTPUTTYPE.MAX_LINE_LENGTH)); + createPropertyFragment( + "casing.xml", + "casing properties", + SCHEMA.PROPERTIES, + getFormattedCasingProperties()); + createPropertyFragment( + "simple_case_mapping.xml", + "casing properties", + SCHEMA.PROPERTIES, + getFormattedSimpleCaseMappingProperties()); + createPropertyFragment( + "case_mapping.xml", + "casing properties", + SCHEMA.PROPERTIES, + getFormattedCaseMappingProperties()); + createPropertyFragment( + "case_folding.xml", + "casing properties", + SCHEMA.PROPERTIES, + getFormattedCaseFoldingProperties()); + createPropertyFragment( + "case_other.xml", + "casing properties", + SCHEMA.PROPERTIES, + getFormattedCaseOtherProperties()); + 
createPropertyFragment( + "script.xml", + "script properties", + SCHEMA.PROPERTIES, + getFormattedScriptProperties()); + createPropertyFragment( + UcdProperty.ISO_Comment, + SCHEMA.PROPERTIES, + getFormattedSyntax(UcdProperty.ISO_Comment)); + createPropertyFragment( + UcdProperty.Hangul_Syllable_Type, + SCHEMA.PROPERTIES, + getFormattedAttribute( + UcdProperty.Hangul_Syllable_Type, VALUESOUTPUTTYPE.MAX_LINE_LENGTH)); + createPropertyFragment( + UcdProperty.Jamo_Short_Name, + SCHEMA.PROPERTIES, + getFormattedSyntax(UcdProperty.Jamo_Short_Name)); + createPropertyFragment( + UcdProperty.Indic_Syllabic_Category, + SCHEMA.PROPERTIES, + getFormattedAttribute( + UcdProperty.Indic_Syllabic_Category, VALUESOUTPUTTYPE.VALUE_PER_LINE)); + createPropertyFragment( + UcdProperty.Indic_Positional_Category, + SCHEMA.PROPERTIES, + getFormattedAttribute( + UcdProperty.Indic_Positional_Category, VALUESOUTPUTTYPE.VALUE_PER_LINE)); + createPropertyFragment( + UcdProperty.Indic_Conjunct_Break, + SCHEMA.PROPERTIES, + getFormattedAttribute( + UcdProperty.Indic_Conjunct_Break, VALUESOUTPUTTYPE.VALUE_PER_LINE)); + createPropertyFragment( + "identifier.xml", + "identifier properties", + SCHEMA.PROPERTIES, + getFormattedIdentifierProperties()); + createPropertyFragment( + "pattern.xml", + "pattern properties", + SCHEMA.PROPERTIES, + getFormattedPatternProperties()); + createPropertyFragment( + "function_graphic.xml", + "properties related to function and graphic characteristics", + SCHEMA.PROPERTIES, + getFormattedFunctionGraphicProperties()); + createPropertyFragment( + "boundaries.xml", + "properties related to boundaries", + SCHEMA.PROPERTIES, + getFormattedBoundaryProperties()); + createPropertyFragment( + "ideographs.xml", + "properties related to ideographs", + SCHEMA.PROPERTIES, + getFormattedIdeographProperties()); + createPropertyFragment( + "miscellaneous.xml", + "miscellaneous properties", + SCHEMA.PROPERTIES, + getFormattedMiscellaneousProperties()); + createPropertyFragment( + 
"Unihan.xml", + "Unihan properties", + SCHEMA.PROPERTIES, + getFormattedUnihanProperties()); + createPropertyFragment( + "Tangut.xml", "Tangut data", SCHEMA.TANGUT, getFormattedTangutProperties()); + createPropertyFragment( + "Nushu.xml", "Nushu data", SCHEMA.NUSHU, getFormattedNushuProperties()); + createPropertyFragment( + "Emoji.xml", "Emoji properties", SCHEMA.EMOJI_DATA, getFormattedEmojiProperties()); + createPropertyFragment( + "do-not-emit.xml", + "do-not-emit", + SCHEMA.DO_NOT_EMIT, + getFormattedDoNotEmit(VALUESOUTPUTTYPE.VALUE_PER_LINE)); + } + + private static void createPropertyFragment(SCHEMA schema, String formattedFragment) + throws IOException { + createPropertyFragment( + schema.getName() + ".xml", schema.getName(), schema, formattedFragment); + } + + private static void createPropertyFragment( + UcdProperty ucdProperty, SCHEMA schema, String formattedFragment) throws IOException { + createPropertyFragment( + ucdProperty.getShortName() + ".xml", + ucdProperty.getShortName() + " attribute", + schema, + formattedFragment); + } + + private static void createPropertyFragment( + String filename, String title, SCHEMA schema, String formattedFragment) + throws IOException { + BufferedWriter writer = getFragmentWriter(filename); + writer.write( + "" + + NEWLINE + + "" + + NEWLINE); + writer.write(formattedFragment); + writer.write(NEWLINE + ""); + writer.flush(); + writer.close(); + } + + private static BufferedWriter getFragmentWriter(String filename) + throws IOException { + File fragmentFolder = + new File(destinationFolder + File.separator); + if (!fragmentFolder.exists()) { + if (!fragmentFolder.mkdir()) { + throw new IOException(); + } + } + File outputFile = new File(fragmentFolder, filename); + FileOutputStream fileOutputStream = new FileOutputStream(outputFile); + OutputStreamWriter outputStreamWriter = + new OutputStreamWriter(fileOutputStream, StandardCharsets.UTF_8); + return new BufferedWriter(outputStreamWriter); + } + + private static 
String getFormattedAttribute( + UcdProperty ucdProperty, VALUESOUTPUTTYPE valuesoutputtype) { + String attributeString = " attribute " + ucdProperty.getShortName() + " "; + List values; + StringBuilder stringBuilder = new StringBuilder(); + + switch (ucdProperty) { + case Age: + values = getAgeValues(); + break; + case Block: + values = getBlockValues(); + break; + case General_Category: + values = getGeneralCategoryValues(); + break; + case Canonical_Combining_Class: + values = getCanonicalCombiningClassValues(); + break; + case Bidi_Class: + values = getBidirectionalValues(); + break; + case Bidi_Paired_Bracket_Type: + values = getBidiPairedBracketTypeValues(); + break; + case Decomposition_Type: + values = getDecompositionTypeValues(); + break; + case NFC_Quick_Check: + values = getNFCQuickCheckValues(); + break; + case NFD_Quick_Check: + values = getNFDQuickCheckValues(); + break; + case NFKC_Quick_Check: + values = getNFKCQuickCheckValues(); + break; + case NFKD_Quick_Check: + values = getNFKDQuickCheckValues(); + break; + case Numeric_Type: + values = getNumericTypeValues(); + break; + case Joining_Type: + values = getJoiningTypeValues(); + break; + case Joining_Group: + values = getJoiningGroupValues(); + break; + case Line_Break: + values = getLineBreakValues(); + break; + case East_Asian_Width: + values = getEastAsianWidthValues(); + break; + case Hangul_Syllable_Type: + values = getHangulSyllableTypeValues(); + break; + case Indic_Syllabic_Category: + values = getIndicSyllabicCategoryValues(); + break; + case Indic_Positional_Category: + values = getIndicPositionalCategoryValues(); + break; + case Indic_Conjunct_Break: + values = getIndicConjunctBreakValues(); + break; + case Vertical_Orientation: + values = getVerticalOrientationValues(); + break; + case Grapheme_Cluster_Break: + values = getGraphemeClusterBreakValues(); + break; + case Word_Break: + values = getWordBreakValues(); + break; + case Sentence_Break: + values = getSentenceBreakValues(); + 
break; + case Do_Not_Emit_Type: + values = getDoNotEmitTypeValues(); + break; + + default: + throw new IllegalStateException( + ucdProperty.getShortName() + + " is not handled by " + + "getFormattedAttribute."); + } + String formattedValues = formatValues(attributeString.length(), values, valuesoutputtype); + stringBuilder + .append(" code-point-attributes &=") + .append(NEWLINE) + .append(attributeString) + .append("{ "); + if (formattedValues.contains(NEWLINE)) { + stringBuilder.append(formattedValues).append(NEWLINE); + stringBuilder.append( + String.format("%" + (attributeString.length() + "}?".length()) + "s", "}?")); + } else { + stringBuilder.append(formattedValues).append(" }?"); + } + return stringBuilder.toString(); + } + + private static String getFormattedSyntax(UcdProperty ucdProperty) { + final PropertyParsingInfo propInfo = PropertyParsingInfo.getPropertyInfo(ucdProperty); + if (propInfo.getRegex() == null) { + throw new NullPointerException( + "Could not find syntax for " + ucdProperty.getShortName()); + } + + String attributeString = + ucdProperty.getShortName().startsWith("cjk") + ? 
" attribute " + ucdProperty.getShortName().substring(2) + " " + : " attribute " + ucdProperty.getShortName() + " "; + String formattedAttributeString; + switch (ucdProperty) { + // { text } + case ISO_Comment: + formattedAttributeString = attributeString + "{ text }?"; + break; + + // { single-code-point } + case Equivalent_Unified_Ideograph: + formattedAttributeString = attributeString + "{ single-code-point }?"; + break; + + // { "" | single-code-point } + case Bidi_Mirroring_Glyph: + formattedAttributeString = attributeString + "{ \"\" | single-code-point }?"; + break; + + // { "#" | single-code-point } + case Bidi_Paired_Bracket: + case Simple_Uppercase_Mapping: + case Simple_Lowercase_Mapping: + case Simple_Titlecase_Mapping: + case Simple_Case_Folding: + formattedAttributeString = attributeString + "{ \"#\" | single-code-point }?"; + break; + + // { "#" | zero-or-more-code-points } + case Decomposition_Mapping: + case NFKC_Casefold: + case NFKC_Simple_Casefold: + formattedAttributeString = + attributeString + "{ \"#\" | zero-or-more-code-points }?"; + break; + + // { "#" | one-or-more-code-points } + case FC_NFKC_Closure: + case Uppercase_Mapping: + case Lowercase_Mapping: + case Titlecase_Mapping: + case Case_Folding: + formattedAttributeString = attributeString + "{ \"#\" | one-or-more-code-points }?"; + break; + + // { "NaN" | RegEx } + case Numeric_Value: + formattedAttributeString = + attributeString + + "{ \"NaN\" | xsd:string { pattern=\"" + + cleanRegex(propInfo.getRegex().toString()) + + "\" } }?"; + break; + + // Special cases + case Name: + formattedAttributeString = + attributeString + + "{ \"\" |" + + NEWLINE + + " \"CJK UNIFIED IDEOGRAPH-#\" |" + + NEWLINE + + " \"CJK COMPATIBILITY IDEOGRAPH-#\" |" + + NEWLINE + + " \"EGYPTIAN HIEROGLYPH-#\" |" + + NEWLINE + + " \"TANGUT IDEOGRAPH-#\" |" + + NEWLINE + + " \"KHITAN SMALL SCRIPT CHARACTER-#\" |" + + NEWLINE + + " \"NUSHU CHARACTER-#\" |" + + NEWLINE + + " xsd:string { pattern=\"" + + 
cleanRegex(propInfo.getRegex().toString()) + + "\" }" + + NEWLINE + + " }?"; + break; + case Unicode_1_Name: + formattedAttributeString = + attributeString + + "{ \"\" | xsd:string { pattern=\"" + + cleanRegex(propInfo.getRegex().toString()) + + "\" } }?"; + break; + case Script: + formattedAttributeString = attributeString + "{ script }?"; + break; + case Script_Extensions: + formattedAttributeString = attributeString + "{ list { script + } }?"; + break; + case kTGT_MergedSrc: + // Ideally, should be obtained from a TR. + String kTGT_MergedSrc = + NEWLINE + + " { xsd:string {pattern=\"L2008-[0-9A-F]{4,5}(-[0-9]{4,5})?\"}" + + NEWLINE + + " | xsd:string {pattern=\"L2006-[0-9]{4}\"}" + + NEWLINE + + " | xsd:string {pattern=\"L1997-[0-9]{4}\"}" + + NEWLINE + + " | xsd:string {pattern=\"L1986-[0-9]{4}\"}" + + NEWLINE + + " | xsd:string {pattern=\"S1968-[0-9]{4}\"}" + + NEWLINE + + " | xsd:string {pattern=\"N1966-[0-9]{3}(-[0-9A-Z]{3,4})?\"}" + + NEWLINE + + " | xsd:string {pattern=\"H2004-[A-Z]-[0-9]{4}\"}" + + NEWLINE + + " | xsd:string {pattern=\"L2012-[0-9]{4}\"}" + + NEWLINE + + " | xsd:string {pattern=\"UTN42-[0-9]{3}\"}" + + NEWLINE + + " }?"; + formattedAttributeString = attributeString + kTGT_MergedSrc; + break; + case kReading: + // Ideally, should be obtained from a TR. + String kReading = "{ xsd:string }?"; + formattedAttributeString = attributeString + kReading; + break; + + default: + formattedAttributeString = + attributeString + + "{ xsd:string { pattern=\"" + + cleanRegex(propInfo.getRegex().toString()) + + "\" } }?"; + } + return " code-point-attributes &=" + NEWLINE + formattedAttributeString; + } + + private static String getFormattedTR38Syntax(UcdProperty ucdProperty) { + // TODO: We should determine whether we still want to show empty values in the XML files. 
+ // TODO: See org.unicode.xml.UcdPropertyDetail.isCJKShowIfEmpty() + boolean isShowIfEmpty = false; + for (UcdPropertyDetail propDetail : UcdPropertyDetail.cjkValues()) { + if (propDetail.getUcdProperty().equals(ucdProperty)) { + isShowIfEmpty = propDetail.isCJKShowIfEmpty(); + } + } + + String attributeString = " attribute " + ucdProperty.getShortName().substring(2); + TR38Details tr38Details = syntaxTR38.get(ucdProperty.name()); + if (tr38Details == null) { + throw new NullPointerException( + "Could not locate details for " + ucdProperty.name() + " in " + TR38URL); + } + String formattedSyntax = formatTR38Syntax(tr38Details, isShowIfEmpty); + + return " code-point-attributes &=" + attributeString + NEWLINE + formattedSyntax; + } + + private static String getFormattedElement(UcdProperty ucdProperty) { + // Currently scoped to UcdProperty.Name_Alias, but might need to handle different + // properties. + String nameAliasElement = "name-alias"; + List values = getNameAliasTypeValues(); + PropertyParsingInfo propInfo = PropertyParsingInfo.getPropertyInfo(ucdProperty); + + String elementString = " element " + nameAliasElement + " {" + NEWLINE; + String attributeAliasString = + " attribute alias { xsd:string { pattern=\"" + + cleanRegex(propInfo.getRegex().toString()) + + "\" } }?," + + NEWLINE; + String attributeTypeString = " attribute type "; + + String formattedValues = + formatValues( + attributeTypeString.length(), values, VALUESOUTPUTTYPE.ALPHABETICAL_GROUP); + + return " code-point-attributes &=" + + NEWLINE + + elementString + + attributeAliasString + + attributeTypeString + + "{ " + + formattedValues + + NEWLINE + + String.format( + "%" + (attributeTypeString.length() + "}? } *".length()) + "s", "}? 
} *"); + } + + private static String getFormattedBoolean(UcdProperty ucdProperty) { + String attributeString = " attribute " + ucdProperty.getShortName() + " "; + + return " code-point-attributes &=" + NEWLINE + attributeString + "{ boolean }?"; + } + + private static String getFormattedValues(SCHEMA schema, VALUESOUTPUTTYPE valuesoutputtype) { + List values = getBinaryValues(); + String formattedValues = formatValues(2, values, valuesoutputtype); + return " " + schema.getName() + " = " + formattedValues; + } + + private static String getFormattedPropertyValues( + UcdProperty ucdProperty, VALUESOUTPUTTYPE valuesoutputtype) { + List values = getScriptValues(); + String formattedValues = formatValues(11, values, valuesoutputtype); + return " " + ucdProperty.name().toLowerCase() + " = " + formattedValues; + } + + private static String getFormattedDoNotEmit(VALUESOUTPUTTYPE valuesoutputtype) { + List values = getDoNotEmitTypeValues(); + String formattedValues = formatValues(26, values, valuesoutputtype); + return " ucd.content &=\n" + + " element do-not-emit {\n" + + " element instead {\n" + + " attribute of { one-or-more-code-points },\n" + + " attribute use { one-or-more-code-points },\n" + + " attribute because { " + + formattedValues + + NEWLINE + + " } }+ }?"; + } + + private static String formatTR38Syntax(TR38Details tr38Details, boolean isShowIfEmpty) { + // TODO: We should determine whether we still want to show empty values in the XML files. + // TODO: See org.unicode.xml.UcdPropertyDetail.isCJKShowIfEmpty() + boolean isList = tr38Details.isList(); + String syntax = cleanRegex(tr38Details.getSyntax()); + // This is a kludge as it depends on only having single OR double quotes in the syntax. If + // we have both, we'll + // need to do more investigation on what RELAXNG Compact supports. + String QUOTMARK = syntax.contains("\"") ? 
"'" : "\""; + + boolean hasNewlines = syntax.contains("\n"); + if (hasNewlines) { + int indent; + String firstLinePrefix; + String ending = isList ? " )+}}?" : " }?"; + if (isShowIfEmpty) { + indent = (isList ? 15 : 8); + firstLinePrefix = isList ? " { \"\" | list { " : " { \"\" | "; + } else { + indent = (isList ? 12 : 4); + firstLinePrefix = isList ? " { list { ( " : " { "; + } + String padding = String.format("%" + indent + "s", ""); + StringBuilder formattedSyntaxBuilder = new StringBuilder(); + Pattern syntaxPattern = Pattern.compile("([^\r\n]+)"); + Matcher matcher = syntaxPattern.matcher(syntax); + while (matcher.find()) { + if (formattedSyntaxBuilder.length() == 0) { + // First line + formattedSyntaxBuilder + .append(firstLinePrefix) + .append("xsd:string { pattern=") + .append(QUOTMARK) + .append(matcher.group(1)) + .append(QUOTMARK) + .append(" }") + .append(NEWLINE); + } else { + // Everything else + formattedSyntaxBuilder + .append(padding) + .append( + matcher.group(1) + .replaceAll( + "^[| ]*", + " | xsd:string { pattern=" + QUOTMARK)) + .append(QUOTMARK) + .append(" }") + .append(NEWLINE); + } + } + formattedSyntaxBuilder.append(ending); + return formattedSyntaxBuilder.toString(); + + } else { + if (isShowIfEmpty) { + if (isList) { + return " { \"\" | list { xsd:string { pattern=" + + QUOTMARK + + syntax + + QUOTMARK + + " }+ } }?"; + } else { + return " { \"\" | xsd:string { pattern=" + + QUOTMARK + + syntax + + QUOTMARK + + " } }?"; + } + } else { + if (isList) { + return " { list { xsd:string { pattern=" + + QUOTMARK + + syntax + + QUOTMARK + + " }+ } }?"; + } else { + return " { xsd:string { pattern=" + QUOTMARK + syntax + QUOTMARK + " } }?"; + } + } + } + } + + private static String formatValues( + int indent, List values, VALUESOUTPUTTYPE valuesoutputtype) { + StringBuilder valueBlock = new StringBuilder(); + StringBuilder currentLine = new StringBuilder(); + String padding = String.format("%" + indent + "s", ""); + String groupPrefix = ""; + 
for (String value : values) { + StringBuilder formattedValue = new StringBuilder(); + if (valueBlock.length() > 0 || currentLine.length() > 0) { + formattedValue.append("| "); + } + if (value.startsWith("xsd")) { + formattedValue.append(value); + } else { + formattedValue.append("\"").append(value).append("\""); + } + + switch (valuesoutputtype) { + case NUMERICAL_GROUP: + case ALPHABETICAL_GROUP: + String valuePrefix = getValuePrefix(value, valuesoutputtype); + if (groupPrefix.isEmpty()) { + currentLine.append(formattedValue); + groupPrefix = valuePrefix; + } else if (valuePrefix.equals(groupPrefix)) { + int testLength = + valueBlock.length() == 0 + ? padding.length() + currentLine.length() + " ".length() + : currentLine.length() + " ".length(); + if ((testLength + formattedValue.length()) > MAX_LINE_LENGTH) { + valueBlock.append(currentLine).append(NEWLINE); + currentLine.setLength(0); + currentLine.append(padding).append(formattedValue); + } else { + if (currentLine.length() > 0) { + currentLine.append(" "); + } + currentLine.append(formattedValue); + } + } else { + valueBlock.append(currentLine).append(NEWLINE); + currentLine.setLength(0); + currentLine.append(padding).append(formattedValue); + groupPrefix = valuePrefix; + } + break; + + case MAX_LINE_LENGTH: + int testLength = + valueBlock.length() == 0 + ? 
padding.length() + currentLine.length() + " ".length() + : currentLine.length() + " ".length(); + if ((testLength + formattedValue.length()) > MAX_LINE_LENGTH) { + valueBlock.append(currentLine).append(NEWLINE); + currentLine.setLength(0); + currentLine.append(padding).append(formattedValue); + } else { + if (currentLine.length() > 0) { + currentLine.append(" "); + } + currentLine.append(formattedValue); + } + break; + + case VALUE_PER_LINE: + default: + if (valueBlock.length() > 0) { + valueBlock.append(NEWLINE).append(padding).append("| "); + } + if (value.startsWith("xsd")) { + valueBlock.append(value); + } else { + valueBlock.append("\"").append(value).append("\""); + } + } + } + valueBlock.append(currentLine); + return valueBlock.toString(); + } + + private static String getValuePrefix(String value, VALUESOUTPUTTYPE valuesoutputtype) { + if (valuesoutputtype == VALUESOUTPUTTYPE.ALPHABETICAL_GROUP) { + return value.substring(0, 1); + } + if (valuesoutputtype == VALUESOUTPUTTYPE.NUMERICAL_GROUP) { + if (value.contains(".")) { + return value.substring(0, value.indexOf(".")); + } else { + // String value in list of numbers. See Age_Values for an example. 
+ return value; + } + } else { + throw new IllegalArgumentException(); + } + } + + private static String cleanRegex(String regex) { + return regex.replaceAll("\\[-", "[\\\\-").replaceAll("\\\\/", "/").replaceAll("\\\\'", "'"); + } + + // ********************* Combined properties ********************// + + private static String getFormattedDecompositionProperties() { + return getFormattedAttribute( + UcdProperty.Decomposition_Type, VALUESOUTPUTTYPE.MAX_LINE_LENGTH) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Decomposition_Mapping); + } + + private static String getFormattedCompositionProperties() { + return getFormattedBoolean(UcdProperty.Composition_Exclusion) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Full_Composition_Exclusion); + } + + private static String getFormattedQuickCheckProperties() { + return getFormattedAttribute(UcdProperty.NFC_Quick_Check, VALUESOUTPUTTYPE.MAX_LINE_LENGTH) + + DOUBLELINE + + getFormattedAttribute( + UcdProperty.NFD_Quick_Check, VALUESOUTPUTTYPE.MAX_LINE_LENGTH) + + DOUBLELINE + + getFormattedAttribute( + UcdProperty.NFKC_Quick_Check, VALUESOUTPUTTYPE.MAX_LINE_LENGTH) + + DOUBLELINE + + getFormattedAttribute( + UcdProperty.NFKD_Quick_Check, VALUESOUTPUTTYPE.MAX_LINE_LENGTH) + + TRIPLELINE + + getFormattedBoolean(UcdProperty.Expands_On_NFC) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Expands_On_NFD) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Expands_On_NFKC) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Expands_On_NFKD) + + TRIPLELINE + + getFormattedSyntax(UcdProperty.FC_NFKC_Closure); + } + + private static String getFormattedNumericProperties() { + return getFormattedAttribute(UcdProperty.Numeric_Type, VALUESOUTPUTTYPE.MAX_LINE_LENGTH) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Numeric_Value); + } + + private static String getFormattedJoiningProperties() { + return getFormattedAttribute(UcdProperty.Joining_Type, VALUESOUTPUTTYPE.MAX_LINE_LENGTH) + + DOUBLELINE + + getFormattedAttribute( + 
UcdProperty.Joining_Group, VALUESOUTPUTTYPE.ALPHABETICAL_GROUP); + } + + private static String getFormattedCasingProperties() { + return getFormattedBoolean(UcdProperty.Uppercase) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Lowercase) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Other_Uppercase) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Other_Lowercase); + } + + private static String getFormattedSimpleCaseMappingProperties() { + return getFormattedSyntax(UcdProperty.Simple_Uppercase_Mapping) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Simple_Lowercase_Mapping) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Simple_Titlecase_Mapping); + } + + private static String getFormattedCaseMappingProperties() { + return getFormattedSyntax(UcdProperty.Uppercase_Mapping) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Lowercase_Mapping) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Titlecase_Mapping); + } + + private static String getFormattedCaseFoldingProperties() { + return getFormattedSyntax(UcdProperty.Simple_Case_Folding) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Case_Folding); + } + + private static String getFormattedCaseOtherProperties() { + return getFormattedBoolean(UcdProperty.Case_Ignorable) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Cased) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Changes_When_Casefolded) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Changes_When_Casemapped) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Changes_When_Lowercased) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Changes_When_NFKC_Casefolded) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Changes_When_Titlecased) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Changes_When_Uppercased) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.NFKC_Casefold) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.NFKC_Simple_Casefold); + } + + private static String getFormattedScriptProperties() { + return 
getFormattedPropertyValues(UcdProperty.Script, VALUESOUTPUTTYPE.ALPHABETICAL_GROUP) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Script) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Script_Extensions); + } + + private static String getFormattedIdentifierProperties() { + return getFormattedBoolean(UcdProperty.ID_Start) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Other_ID_Start) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.XID_Start) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.ID_Continue) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Other_ID_Continue) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.XID_Continue) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.ID_Compat_Math_Start) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.ID_Compat_Math_Continue); + } + + private static String getFormattedPatternProperties() { + return getFormattedBoolean(UcdProperty.Pattern_Syntax) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Pattern_White_Space); + } + + private static String getFormattedFunctionGraphicProperties() { + return getFormattedBoolean(UcdProperty.Dash) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Hyphen) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Quotation_Mark) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Terminal_Punctuation) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Sentence_Terminal) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Diacritic) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Extender) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Soft_Dotted) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Alphabetic) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Other_Alphabetic) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Math) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Other_Math) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Hex_Digit) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.ASCII_Hex_Digit) + + DOUBLELINE + + 
getFormattedBoolean(UcdProperty.Default_Ignorable_Code_Point) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Other_Default_Ignorable_Code_Point) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Logical_Order_Exception) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Prepended_Concatenation_Mark) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Modifier_Combining_Mark) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.White_Space) + + DOUBLELINE + + getFormattedAttribute( + UcdProperty.Vertical_Orientation, VALUESOUTPUTTYPE.MAX_LINE_LENGTH) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Regional_Indicator); + } + + private static String getFormattedBoundaryProperties() { + return getFormattedBoolean(UcdProperty.Grapheme_Base) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Grapheme_Extend) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Other_Grapheme_Extend) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Grapheme_Link) + + DOUBLELINE + + getFormattedAttribute( + UcdProperty.Grapheme_Cluster_Break, VALUESOUTPUTTYPE.ALPHABETICAL_GROUP) + + DOUBLELINE + + getFormattedAttribute(UcdProperty.Word_Break, VALUESOUTPUTTYPE.ALPHABETICAL_GROUP) + + DOUBLELINE + + getFormattedAttribute( + UcdProperty.Sentence_Break, VALUESOUTPUTTYPE.ALPHABETICAL_GROUP); + } + + private static String getFormattedIdeographProperties() { + return getFormattedBoolean(UcdProperty.Ideographic) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Unified_Ideograph) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Equivalent_Unified_Ideograph) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.IDS_Binary_Operator) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.IDS_Trinary_Operator) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.IDS_Unary_Operator) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Radical); + } + + private static String getFormattedMiscellaneousProperties() { + return getFormattedBoolean(UcdProperty.Deprecated) + + DOUBLELINE + + 
getFormattedBoolean(UcdProperty.Variation_Selector) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Noncharacter_Code_Point); + } + + private static String getFormattedUnihanProperties() { + return getFormattedTR38Syntax(UcdProperty.kAccountingNumeric) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kAlternateTotalStrokes) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kBigFive) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCangjie) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCantonese) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCCCII) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCheungBauer) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCheungBauerIndex) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCihaiT) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCNS1986) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCNS1992) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCompatibilityVariant) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCowles) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kDaeJaweon) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kDefinition) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kEACC) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kFanqie) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kFenn) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kFennIndex) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kFourCornerCode) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kGB0) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kGB1) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kGB3) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kGB5) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kGB7) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kGB8) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kGradeLevel) + + DOUBLELINE + + 
getFormattedTR38Syntax(UcdProperty.kGSR) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kHangul) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kHanYu) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kHanyuPinlu) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kHanyuPinyin) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kHDZRadBreak) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kHKGlyph) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIBMJapan) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIICore) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_GSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_HSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_JSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_KPSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_KSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_MSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_SSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_TSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_UKSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_USource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_VSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRGDaeJaweon) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRGHanyuDaZidian) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRGKangXi) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kJa) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kJapanese) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kJapaneseKun) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kJapaneseOn) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kJinmeiyoKanji) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kJis0) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kJis1) + + DOUBLELINE + + 
getFormattedTR38Syntax(UcdProperty.kJIS0213) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kJoyoKanji) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kKangXi) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kKarlgren) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kKorean) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kKoreanEducationHanja) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kKoreanName) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kLau) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kMainlandTelegraph) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kMandarin) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kMatthews) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kMeyerWempe) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kMojiJoho) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kMorohashi) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kNelson) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kOtherNumeric) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kPhonetic) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kPrimaryNumeric) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kPseudoGB1) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kRSAdobe_Japan1_6) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kRSUnicode) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kSBGY) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kSemanticVariant) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kSimplifiedVariant) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kSMSZD2003Index) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kSMSZD2003Readings) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kSpecializedSemanticVariant) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kSpoofingVariant) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kStrange) + + DOUBLELINE + + 
getFormattedTR38Syntax(UcdProperty.kTaiwanTelegraph) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kTang) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kTGH) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kTGHZ2013) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kTotalStrokes) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kTraditionalVariant) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kUnihanCore2020) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kVietnamese) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kVietnameseNumeric) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kXerox) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kXHC1983) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kZhuang) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kZhuangNumeric) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kZVariant); + } + + private static String getFormattedTangutProperties() { + return getFormattedSyntax(UcdProperty.kRSTUnicode) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.kTGT_MergedSrc); + } + + private static String getFormattedNushuProperties() { + return getFormattedSyntax(UcdProperty.kSrc_NushuDuben) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.kReading); + } + + private static String getFormattedEmojiProperties() { + return getFormattedBoolean(UcdProperty.Emoji) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Emoji_Presentation) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Emoji_Modifier) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Emoji_Modifier_Base) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Emoji_Component) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Extended_Pictographic); + } + + // ********************* Attribute values ********************// + + private static List getBinaryValues() { + List values = new ArrayList<>(); + for (Binary binaryValues : Binary.values()) { + values.add(binaryValues.getShortName()); + } + // 
Binary should display as Y | N. + values.sort(Collections.reverseOrder()); + return values; + } + + private static List getAgeValues() { + List values = new ArrayList<>(); + for (Age_Values ageValues : Age_Values.values()) { + String shortName = ageValues.getShortName(); + if (shortName.equals("NA")) { + values.add("unassigned"); + } else if (shortName.equals("13.1")) { + // https://github.com/unicode-org/unicodetools/issues/100 + } else { + values.add(shortName); + } + } + return values; + } + + private static List getNameAliasTypeValues() { + List values = new ArrayList<>(); + for (AttributeResolver.AliasType aliastypeValues : AttributeResolver.AliasType.values()) { + if (!aliastypeValues.equals(AttributeResolver.AliasType.NONE)) { + values.add(aliastypeValues.toString()); + } + } + return values; + } + + private static List getBlockValues() { + List values = new ArrayList<>(); + for (Block_Values blockValues : Block_Values.values()) { + values.add(blockValues.getShortName()); + } + return values; + } + + private static List getGeneralCategoryValues() { + List values = new ArrayList<>(); + for (General_Category_Values generalCategoryValues : General_Category_Values.values()) { + if (!generalCategoryValues + .getShortName() + .toUpperCase() + .equals(generalCategoryValues.getShortName())) { + // Some of the General_Category_Values (LC, L, M, N, P, S, Z, C) stand for grouping + // of related + // General_Category values. They won't occur on any individual code point, so can be + // ignored. + values.add(generalCategoryValues.getShortName()); + } + } + return values; + } + + private static List getCanonicalCombiningClassValues() { + List values = new ArrayList<>(); + values.add("xsd:integer { minInclusive=\"0\" maxInclusive=\"254\" }"); + // Because the set of values that this property has taken across the various versions of the + // UCD is rather + // large, our schema does not restrict the possible values to those actually used. 
+ // for (Canonical_Combining_Class_Values canonicalCombiningClassValues : + // Canonical_Combining_Class_Values.values()) { + // values.add(canonicalCombiningClassValues.getShortName()); + // } + return values; + } + + private static List getBidirectionalValues() { + List values = new ArrayList<>(); + for (Bidi_Class_Values bidiClassValues : Bidi_Class_Values.values()) { + values.add(bidiClassValues.getShortName()); + } + return values; + } + + private static List getBidiPairedBracketTypeValues() { + List values = new ArrayList<>(); + // Order should be Open/Close/None + values.add(Bidi_Paired_Bracket_Type_Values.Open.getShortName()); + values.add(Bidi_Paired_Bracket_Type_Values.Close.getShortName()); + values.add(Bidi_Paired_Bracket_Type_Values.None.getShortName()); + // Now let's check to see if there is anything else that we didn't expect + for (Bidi_Paired_Bracket_Type_Values bidiPairedBracketTypeValue : + Bidi_Paired_Bracket_Type_Values.values()) { + if (!values.contains(bidiPairedBracketTypeValue.getShortName())) { + throw new IllegalArgumentException(); + } + } + return values; + } + + private static List getDecompositionTypeValues() { + List values = new ArrayList<>(); + for (Decomposition_Type_Values decompositionTypeValues : + Decomposition_Type_Values.values()) { + // We want "none" to be last. 
+ if (decompositionTypeValues != Decomposition_Type_Values.None) { + values.add(decompositionTypeValues.getNames().getOtherNames().get(0)); + } + } + values.add(Decomposition_Type_Values.None.getNames().getOtherNames().get(0)); + return values; + } + + private static List getNFCQuickCheckValues() { + List values = new ArrayList<>(); + // Order should be Yes/No/Maybe + values.add(NFC_Quick_Check_Values.Yes.getShortName()); + values.add(NFC_Quick_Check_Values.No.getShortName()); + values.add(NFC_Quick_Check_Values.Maybe.getShortName()); + // Now let's check to see if there is anything else that we didn't expect + for (NFC_Quick_Check_Values nfcQuickCheckValues : NFC_Quick_Check_Values.values()) { + if (!values.contains(nfcQuickCheckValues.getShortName())) { + throw new IllegalArgumentException(); + } + } + return values; + } + + private static List getNFDQuickCheckValues() { + List values = new ArrayList<>(); + // Order should be Yes/No + values.add(NFD_Quick_Check_Values.Yes.getShortName()); + values.add(NFD_Quick_Check_Values.No.getShortName()); + // Now let's check to see if there is anything else that we didn't expect + for (NFD_Quick_Check_Values nfdQuickCheckValues : NFD_Quick_Check_Values.values()) { + if (!values.contains(nfdQuickCheckValues.getShortName())) { + throw new IllegalArgumentException(); + } + } + return values; + } + + private static List getNFKCQuickCheckValues() { + List values = new ArrayList<>(); + // Order should be Yes/No/Maybe + values.add(NFKC_Quick_Check_Values.Yes.getShortName()); + values.add(NFKC_Quick_Check_Values.No.getShortName()); + values.add(NFKC_Quick_Check_Values.Maybe.getShortName()); + // Now let's check to see if there is anything else that we didn't expect + for (NFKC_Quick_Check_Values nfkcQuickCheckValues : NFKC_Quick_Check_Values.values()) { + if (!values.contains(nfkcQuickCheckValues.getShortName())) { + throw new IllegalArgumentException(); + } + } + return values; + } + + private static List getNFKDQuickCheckValues() 
{ + List values = new ArrayList<>(); + // Order should be Yes/No + values.add(NFKD_Quick_Check_Values.Yes.getShortName()); + values.add(NFKD_Quick_Check_Values.No.getShortName()); + // Now let's check to see if there is anything else that we didn't expect + for (NFKD_Quick_Check_Values nfkdQuickCheckValues : NFKD_Quick_Check_Values.values()) { + if (!values.contains(nfkdQuickCheckValues.getShortName())) { + throw new IllegalArgumentException(); + } + } + return values; + } + + private static List getNumericTypeValues() { + List values = new ArrayList<>(); + // Order should be Decimal/Digit/Numeric/None + values.add(Numeric_Type_Values.Decimal.getShortName()); + values.add(Numeric_Type_Values.Digit.getShortName()); + values.add(Numeric_Type_Values.Numeric.getShortName()); + values.add(Numeric_Type_Values.None.getShortName()); + // Now let's check to see if there is anything else that we didn't expect + for (Numeric_Type_Values numericTypeValues : Numeric_Type_Values.values()) { + if (!values.contains(numericTypeValues.getShortName())) { + throw new IllegalArgumentException(); + } + } + return values; + } + + private static List getJoiningTypeValues() { + List values = new ArrayList<>(); + for (Joining_Type_Values joiningTypeValues : Joining_Type_Values.values()) { + values.add(joiningTypeValues.getShortName()); + } + return values; + } + + private static List getJoiningGroupValues() { + List values = new ArrayList<>(); + for (Joining_Group_Values joiningGroupValues : Joining_Group_Values.values()) { + values.add(joiningGroupValues.getShortName()); + } + return values; + } + + private static List getLineBreakValues() { + List values = new ArrayList<>(); + for (Line_Break_Values lineBreakValues : Line_Break_Values.values()) { + values.add(lineBreakValues.getShortName()); + } + return values; + } + + private static List getEastAsianWidthValues() { + List values = new ArrayList<>(); + for (East_Asian_Width_Values eastAsianWidthValues : East_Asian_Width_Values.values()) 
{ + values.add(eastAsianWidthValues.getShortName()); + } + return values; + } + + private static List getScriptValues() { + List excludedValues = + Arrays.asList( + Script_Values.Han_with_Bopomofo, + Script_Values.Japanese, + Script_Values.Korean, + Script_Values.Math_Symbols, + Script_Values.Emoji_Symbols, + Script_Values.Other_Symbols, + Script_Values.Unwritten); + List values = new ArrayList<>(); + for (Script_Values scriptValue : Script_Values.values()) { + if (!excludedValues.contains(scriptValue)) { + values.add(scriptValue.getShortName()); + } + // Include the following if you want to add other names + // if (!scriptValue.getNames().getOtherNames().isEmpty()) { + // values.add(scriptValue.getNames().getOtherNames().get(0)); + // } + } + Collections.sort(values); + return values; + } + + private static List getHangulSyllableTypeValues() { + List values = new ArrayList<>(); + for (Hangul_Syllable_Type_Values hangulSyllableTypeValues : + Hangul_Syllable_Type_Values.values()) { + values.add(hangulSyllableTypeValues.getShortName()); + } + return values; + } + + private static List getIndicSyllabicCategoryValues() { + List values = new ArrayList<>(); + for (Indic_Syllabic_Category_Values indicSyllabicCategoryValues : + Indic_Syllabic_Category_Values.values()) { + values.add(indicSyllabicCategoryValues.getShortName()); + } + return values; + } + + private static List getIndicPositionalCategoryValues() { + List values = new ArrayList<>(); + for (Indic_Positional_Category_Values indicPositionalCategoryValues : + Indic_Positional_Category_Values.values()) { + values.add(indicPositionalCategoryValues.getShortName()); + } + return values; + } + + private static List getIndicConjunctBreakValues() { + List values = new ArrayList<>(); + for (Indic_Conjunct_Break_Values indicConjunctBreakValues : + Indic_Conjunct_Break_Values.values()) { + values.add(indicConjunctBreakValues.getShortName()); + } + return values; + } + + private static List getVerticalOrientationValues() { + 
List values = new ArrayList<>(); + for (Vertical_Orientation_Values verticalOrientationValues : + Vertical_Orientation_Values.values()) { + values.add(verticalOrientationValues.getShortName()); + } + return values; + } + + private static List getGraphemeClusterBreakValues() { + List values = new ArrayList<>(); + for (Grapheme_Cluster_Break_Values graphemeClusterBreakValues : + Grapheme_Cluster_Break_Values.values()) { + values.add(graphemeClusterBreakValues.getShortName()); + } + return values; + } + + private static List getWordBreakValues() { + List values = new ArrayList<>(); + for (Word_Break_Values wordBreakValues : Word_Break_Values.values()) { + values.add(wordBreakValues.getShortName()); + } + return values; + } + + private static List getSentenceBreakValues() { + List values = new ArrayList<>(); + for (Sentence_Break_Values sentenceBreakValues : Sentence_Break_Values.values()) { + values.add(sentenceBreakValues.getShortName()); + } + return values; + } + + private static List getDoNotEmitTypeValues() { + List values = new ArrayList<>(); + for (Do_Not_Emit_Type_Values doNotEmitTypeValues : Do_Not_Emit_Type_Values.values()) { + values.add(doNotEmitTypeValues.getShortName()); + } + Collections.sort(values); + return values; + } + + // ********************* Utility methods ********************// + + private static HashMap parseTR38() throws IOException, URISyntaxException { + HashMap syntaxTR38 = new HashMap<>(); + URI uri = new URI(TR38URL); + StringBuilder stringBuilder = new StringBuilder(); + try (InputStream is = uri.toURL().openStream()) { + int ptr = 0; + while ((ptr = is.read()) != -1) { + stringBuilder.append((char) ptr); + } + } + Pattern syntaxPattern = + Pattern.compile( + ">Property.*?(.*?).*?>Delimiter.*?>(.*?).*?>Syntax.*?>(.*?)", + Pattern.DOTALL); + Matcher matcher = syntaxPattern.matcher(stringBuilder.toString()); + while (matcher.find()) { + String delimiter = matcher.group(2).trim(); + boolean isList = false; + switch (delimiter) { + case 
"N/A": + break; + case "space": + isList = true; + break; + default: + throw new IllegalArgumentException( + "Only \"space\" or \"N/A\" are supported values for Delimiter." + + " Found: " + + delimiter); + } + TR38Details tr38Details = + new TR38Details(isList, matcher.group(3).trim().replaceAll("
    ", "")); + syntaxTR38.put(matcher.group(1).trim(), tr38Details); + } + return syntaxTR38; + } +} diff --git a/unicodetools/src/main/java/org/unicode/xml/UCDDataResolver.java b/unicodetools/src/main/java/org/unicode/xml/UCDDataResolver.java new file mode 100644 index 0000000000..a30067bbb6 --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/UCDDataResolver.java @@ -0,0 +1,210 @@ +package org.unicode.xml; + +import com.ibm.icu.util.VersionInfo; +import java.util.*; +import org.unicode.cldr.draft.FileUtilities; +import org.unicode.props.IndexUnicodeProperties; +import org.unicode.props.PropertyParsingInfo; +import org.unicode.props.UcdLineParser; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +public class UCDDataResolver { + + private final IndexUnicodeProperties indexUnicodeProperties; + private final String namespace; + private final UCDXMLWriter writer; + + public UCDDataResolver(IndexUnicodeProperties iup, String namespace, UCDXMLWriter writer) { + indexUnicodeProperties = iup; + this.namespace = namespace; + this.writer = writer; + } + + public void buildSection(UcdSectionDetail.UcdSection ucdSection) throws SAXException { + VersionInfo minVersion = ucdSection.getMinVersion(); + VersionInfo maxVersion = ucdSection.getMaxVersion(); + String tag = ucdSection.toString(); + String childTag = ucdSection.getChildTag(); + boolean parserWithRange = ucdSection.getParserWithRange(); + boolean parserWithMissing = ucdSection.getParserWithMissing(); + UcdSectionComponent[] ucdSectionComponents = + ucdSection.getUcdSectionDetail().getUcdSectionComponents(); + + if (isCompatibleVersion(minVersion, maxVersion)) { + writer.startElement(tag); + { + for (UcdSectionComponent ucdSectionComponent : ucdSectionComponents) { + if (isCompatibleVersion( + ucdSectionComponent.getMinVersion(), + ucdSectionComponent.getMaxVersion())) { + final PropertyParsingInfo fileInfoEVS = + PropertyParsingInfo.getPropertyInfo( + 
ucdSectionComponent.getUcdProperty()); + String fullFilename = + fileInfoEVS.getFullFileName(indexUnicodeProperties.getUcdVersion()); + UcdLineParser parser = + new UcdLineParser(FileUtilities.in("", fullFilename)); + parser.withRange(parserWithRange); + parser.withMissing(parserWithMissing); + switch (ucdSection) { + case BLOCKS: + for (UcdLineParser.UcdLine line : parser) { + if (!line.getOriginalLine().startsWith("#")) { + AttributesImpl attributes = + getBlockAttributes(namespace, line); + writer.startElement(childTag, attributes); + { + writer.endElement(childTag); + } + } + } + break; + case NAMEDSEQUENCES: + HashMap namedSequences = new HashMap<>(); + for (UcdLineParser.UcdLine line : parser) { + String[] parts = line.getParts(); + namedSequences.put(parts[0], parts[1]); + } + List names = new ArrayList<>(namedSequences.keySet()); + Collections.sort(names); + for (String name : names) { + AttributesImpl attributes = + getNamedSequenceAttributes( + namespace, name, namedSequences); + writer.startElement(childTag, attributes); + { + writer.endElement(childTag); + } + } + break; + case PROVISIONALNAMEDSEQUENCES: + HashMap provisionalNamedSequences = new HashMap<>(); + for (UcdLineParser.UcdLine line : parser) { + String[] parts = line.getParts(); + provisionalNamedSequences.put(parts[0], parts[1]); + } + List psNames = + new ArrayList<>(provisionalNamedSequences.keySet()); + Collections.sort(psNames); + for (String name : psNames) { + AttributesImpl attributes = + getNamedSequenceAttributes( + namespace, name, provisionalNamedSequences); + writer.startElement(childTag, attributes); + { + writer.endElement(childTag); + } + } + break; + default: + for (UcdLineParser.UcdLine line : parser) { + AttributesImpl attributes = + getAttributes(ucdSection, namespace, line); + writer.startElement(childTag, attributes); + { + writer.endElement(childTag); + } + } + } + } + } + writer.endElement(tag); + } + } + } + + private AttributesImpl getAttributes( + 
UcdSectionDetail.UcdSection ucdSection, String namespace, UcdLineParser.UcdLine line) { + switch (ucdSection) { + case CJKRADICALS: + return getCJKRadicalAttributes(namespace, line); + case DONOTEMIT: + return getDoNotEmitAttributes(namespace, line); + case EMOJISOURCES: + return getEmojiSourceAttributes(namespace, line); + case NORMALIZATIONCORRECTIONS: + return getNCAttributes(namespace, line); + case STANDARDIZEDVARIANTS: + return getSVAttributes(namespace, line); + default: + throw new IllegalArgumentException( + "getAttributes failed on an unexpected UcdSection"); + } + } + + private static AttributesImpl getBlockAttributes(String namespace, UcdLineParser.UcdLine line) { + String[] parts = line.getParts(); + String[] range = parts[0].split("\\.\\."); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute(namespace, "first-cp", "first-cp", "CDATA", range[0]); + attributes.addAttribute(namespace, "last-cp", "last-cp", "CDATA", range[1]); + attributes.addAttribute(namespace, "name", "name", "CDATA", parts[1]); + return attributes; + } + + private static AttributesImpl getCJKRadicalAttributes( + String namespace, UcdLineParser.UcdLine line) { + String[] parts = line.getParts(); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute(namespace, "number", "number", "CDATA", parts[0]); + attributes.addAttribute(namespace, "radical", "radical", "CDATA", parts[1]); + attributes.addAttribute(namespace, "ideograph", "ideograph", "CDATA", parts[2]); + return attributes; + } + + private static AttributesImpl getDoNotEmitAttributes( + String namespace, UcdLineParser.UcdLine line) { + String[] parts = line.getParts(); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute(namespace, "of", "of", "CDATA", parts[0]); + attributes.addAttribute(namespace, "use", "use", "CDATA", parts[1]); + attributes.addAttribute(namespace, "because", "because", "CDATA", parts[2]); + return attributes; + } + + private static 
AttributesImpl getEmojiSourceAttributes( + String namespace, UcdLineParser.UcdLine line) { + String[] parts = line.getParts(); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute(namespace, "unicode", "unicode", "CDATA", parts[0]); + attributes.addAttribute(namespace, "docomo", "docomo", "CDATA", parts[1]); + attributes.addAttribute(namespace, "kddi", "kddi", "CDATA", parts[2]); + attributes.addAttribute(namespace, "softbank", "softbank", "CDATA", parts[3]); + return attributes; + } + + private static AttributesImpl getNamedSequenceAttributes( + String namespace, String name, HashMap namedSequences) { + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute(namespace, "name", "name", "CDATA", name); + attributes.addAttribute(namespace, "cps", "cps", "CDATA", namedSequences.get(name)); + return attributes; + } + + private static AttributesImpl getNCAttributes(String namespace, UcdLineParser.UcdLine line) { + String[] parts = line.getParts(); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute(namespace, "cp", "cp", "CDATA", parts[0]); + attributes.addAttribute(namespace, "old", "old", "CDATA", parts[1]); + attributes.addAttribute(namespace, "new", "new", "CDATA", parts[2]); + attributes.addAttribute(namespace, "version", "version", "CDATA", parts[3]); + return attributes; + } + + private static AttributesImpl getSVAttributes(String namespace, UcdLineParser.UcdLine line) { + String[] parts = line.getParts(); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute(namespace, "cps", "cps", "CDATA", parts[0]); + attributes.addAttribute(namespace, "desc", "desc", "CDATA", parts[1]); + attributes.addAttribute( + namespace, "when", "when", "CDATA", parts[2] != null ? 
parts[2] : ""); + return attributes; + } + + private boolean isCompatibleVersion(VersionInfo minVersion, VersionInfo maxVersion) { + return (indexUnicodeProperties.getUcdVersion().compareTo(minVersion) >= 0 + && (maxVersion == null + || indexUnicodeProperties.getUcdVersion().compareTo(maxVersion) <= 0)); + } +} diff --git a/unicodetools/src/main/java/org/unicode/xml/UCDXMLWriter.java b/unicodetools/src/main/java/org/unicode/xml/UCDXMLWriter.java new file mode 100644 index 0000000000..ff31e69c61 --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/UCDXMLWriter.java @@ -0,0 +1,74 @@ +package org.unicode.xml; + +import java.io.FileOutputStream; +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerConfigurationException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.sax.SAXTransformerFactory; +import javax.xml.transform.sax.TransformerHandler; +import javax.xml.transform.stream.StreamResult; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +public class UCDXMLWriter { + + public static final String NAMESPACE = "http://www.unicode.org/ns/2003/ucd/1.0"; + + private final TransformerHandler transformerHandler; + + public TransformerHandler getTransformerHandler() { + return transformerHandler; + } + + public UCDXMLWriter(FileOutputStream f) throws TransformerConfigurationException { + TransformerFactory tfactory = TransformerFactory.newInstance(); + SAXTransformerFactory sfactory = (SAXTransformerFactory) tfactory; + transformerHandler = sfactory.newTransformerHandler(); + Transformer transformer = transformerHandler.getTransformer(); + transformer.setOutputProperty(OutputKeys.ENCODING, "utf-8"); + transformer.setOutputProperty(OutputKeys.METHOD, "xml"); + transformer.setOutputProperty(OutputKeys.INDENT, "yes"); + transformer.setOutputProperty(OutputKeys.STANDALONE, "yes"); + 
transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3"); + transformer.setOutputProperty("{http://xml.apache.org/xalan}indent-amount", "3"); + transformerHandler.setResult(new StreamResult(f)); + } + + public void startFile() throws SAXException { + transformerHandler.startDocument(); + char[] c = "\n".toCharArray(); + transformerHandler.characters(c, 0, c.length); + // TODO: JRW change hardcoded 2023 to current year. + c = " \u00A9 2023 Unicode\u00AE, Inc. ".toCharArray(); + transformerHandler.comment(c, 0, c.length); + c = "\n".toCharArray(); + transformerHandler.characters(c, 0, c.length); + c = " For terms of use, see http://www.unicode.org/terms_of_use.html ".toCharArray(); + transformerHandler.comment(c, 0, c.length); + c = "\n\n\n".toCharArray(); + transformerHandler.characters(c, 0, c.length); + } + + public void endFile() throws SAXException { + transformerHandler.endDocument(); + } + + public void startElement(String tagName) throws SAXException { + AttributesImpl attributes = new AttributesImpl(); + startElement(tagName, attributes); + } + + public void startElement(String tagName, AttributesImpl attributes) throws SAXException { + transformerHandler.startElement(NAMESPACE, tagName, tagName, attributes); + } + + public void addContent(String s) throws SAXException { + char[] d = s.toCharArray(); + transformerHandler.characters(d, 0, d.length); + } + + public void endElement(String tagName) throws SAXException { + transformerHandler.endElement(NAMESPACE, tagName, tagName); + } +} diff --git a/unicodetools/src/main/java/org/unicode/xml/UcdPropertyDetail.java b/unicodetools/src/main/java/org/unicode/xml/UcdPropertyDetail.java new file mode 100644 index 0000000000..a97ef5bab9 --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/UcdPropertyDetail.java @@ -0,0 +1,2356 @@ +package org.unicode.xml; + +import com.ibm.icu.util.VersionInfo; +import java.util.LinkedHashSet; +import java.util.Set; +import 
org.unicode.props.UcdProperty; + +public class UcdPropertyDetail { + + private static LinkedHashSet basePropertyDetails = + new LinkedHashSet(); + private static LinkedHashSet cjkPropertyDetails = + new LinkedHashSet(); + private static LinkedHashSet ucdxmlPropertyDetails = + new LinkedHashSet(); + private static LinkedHashSet allPropertyDetails = + new LinkedHashSet(); + + public static UcdPropertyDetail Age_Detail = + new UcdPropertyDetail( + UcdProperty.Age, VersionInfo.getInstance(3, 2, 0), 1, true, false, false, true); + public static UcdPropertyDetail Name_Detail = + new UcdPropertyDetail( + UcdProperty.Name, + VersionInfo.getInstance(1, 1, 0), + 2, + true, + false, + false, + true); + public static UcdPropertyDetail Jamo_Short_Name_Detail = + new UcdPropertyDetail( + UcdProperty.Jamo_Short_Name, + VersionInfo.getInstance(5, 1, 0), + 3, + true, + false, + false, + true); + public static UcdPropertyDetail General_Category_Detail = + new UcdPropertyDetail( + UcdProperty.General_Category, + VersionInfo.getInstance(1, 1, 0), + 4, + true, + false, + false, + true); + public static UcdPropertyDetail Canonical_Combining_Class_Detail = + new UcdPropertyDetail( + UcdProperty.Canonical_Combining_Class, + VersionInfo.getInstance(1, 1, 0), + 5, + true, + false, + false, + true); + public static UcdPropertyDetail Decomposition_Type_Detail = + new UcdPropertyDetail( + UcdProperty.Decomposition_Type, + VersionInfo.getInstance(1, 1, 0), + 6, + true, + false, + false, + true); + public static UcdPropertyDetail Decomposition_Mapping_Detail = + new UcdPropertyDetail( + UcdProperty.Decomposition_Mapping, + VersionInfo.getInstance(1, 1, 0), + 7, + true, + false, + false, + true); + public static UcdPropertyDetail Numeric_Type_Detail = + new UcdPropertyDetail( + UcdProperty.Numeric_Type, + VersionInfo.getInstance(1, 1, 0), + 8, + true, + false, + false, + true); + public static UcdPropertyDetail Numeric_Value_Detail = + new UcdPropertyDetail( + UcdProperty.Numeric_Value, + 
VersionInfo.getInstance(1, 1, 0), + 9, + true, + false, + false, + true); + public static UcdPropertyDetail Bidi_Class_Detail = + new UcdPropertyDetail( + UcdProperty.Bidi_Class, + VersionInfo.getInstance(1, 1, 0), + 10, + true, + false, + false, + true); + public static UcdPropertyDetail Bidi_Paired_Bracket_Type_Detail = + new UcdPropertyDetail( + UcdProperty.Bidi_Paired_Bracket_Type, + VersionInfo.getInstance(6, 3, 0), + 11, + true, + false, + false, + true); + public static UcdPropertyDetail Bidi_Paired_Bracket_Detail = + new UcdPropertyDetail( + UcdProperty.Bidi_Paired_Bracket, + VersionInfo.getInstance(6, 3, 0), + 12, + true, + false, + false, + true); + public static UcdPropertyDetail Bidi_Mirrored_Detail = + new UcdPropertyDetail( + UcdProperty.Bidi_Mirrored, + VersionInfo.getInstance(1, 1, 0), + 13, + true, + false, + false, + true); + public static UcdPropertyDetail Bidi_Mirroring_Glyph_Detail = + new UcdPropertyDetail( + UcdProperty.Bidi_Mirroring_Glyph, + VersionInfo.getInstance(3, 0, 1), + 14, + true, + false, + false, + true); + public static UcdPropertyDetail Simple_Uppercase_Mapping_Detail = + new UcdPropertyDetail( + UcdProperty.Simple_Uppercase_Mapping, + VersionInfo.getInstance(1, 1, 0), + 15, + true, + false, + false, + true); + public static UcdPropertyDetail Simple_Lowercase_Mapping_Detail = + new UcdPropertyDetail( + UcdProperty.Simple_Lowercase_Mapping, + VersionInfo.getInstance(1, 1, 0), + 16, + true, + false, + false, + true); + public static UcdPropertyDetail Simple_Titlecase_Mapping_Detail = + new UcdPropertyDetail( + UcdProperty.Simple_Titlecase_Mapping, + VersionInfo.getInstance(1, 1, 0), + 17, + true, + false, + false, + true); + public static UcdPropertyDetail Uppercase_Mapping_Detail = + new UcdPropertyDetail( + UcdProperty.Uppercase_Mapping, + VersionInfo.getInstance(2, 1, 8), + 18, + true, + false, + false, + true); + public static UcdPropertyDetail Lowercase_Mapping_Detail = + new UcdPropertyDetail( + 
UcdProperty.Lowercase_Mapping, + VersionInfo.getInstance(2, 1, 8), + 19, + true, + false, + false, + true); + public static UcdPropertyDetail Titlecase_Mapping_Detail = + new UcdPropertyDetail( + UcdProperty.Titlecase_Mapping, + VersionInfo.getInstance(2, 1, 8), + 20, + true, + false, + false, + true); + // public static UcdPropertyDetail Special_Case_Condition_Detail = new UcdPropertyDetail + // ( + // UcdProperty.Special_Case_Condition, VersionInfo.getInstance(1,1,0), 21, + // true, false, false, true); + public static UcdPropertyDetail Simple_Case_Folding_Detail = + new UcdPropertyDetail( + UcdProperty.Simple_Case_Folding, + VersionInfo.getInstance(3, 0, 1), + 22, + true, + false, + false, + true); + public static UcdPropertyDetail Case_Folding_Detail = + new UcdPropertyDetail( + UcdProperty.Case_Folding, + VersionInfo.getInstance(3, 0, 1), + 23, + true, + false, + false, + true); + public static UcdPropertyDetail Joining_Type_Detail = + new UcdPropertyDetail( + UcdProperty.Joining_Type, + VersionInfo.getInstance(2, 0, 0), + 24, + true, + false, + false, + true); + public static UcdPropertyDetail Joining_Group_Detail = + new UcdPropertyDetail( + UcdProperty.Joining_Group, + VersionInfo.getInstance(2, 0, 0), + 25, + true, + false, + false, + true); + public static UcdPropertyDetail East_Asian_Width_Detail = + new UcdPropertyDetail( + UcdProperty.East_Asian_Width, + VersionInfo.getInstance(3, 0, 0), + 26, + true, + false, + false, + true); + public static UcdPropertyDetail Line_Break_Detail = + new UcdPropertyDetail( + UcdProperty.Line_Break, + VersionInfo.getInstance(3, 0, 0), + 27, + true, + false, + false, + true); + public static UcdPropertyDetail Script_Detail = + new UcdPropertyDetail( + UcdProperty.Script, + VersionInfo.getInstance(3, 1, 0), + 28, + true, + false, + false, + true); + public static UcdPropertyDetail Script_Extensions_Detail = + new UcdPropertyDetail( + UcdProperty.Script_Extensions, + VersionInfo.getInstance(6, 1, 0), + 29, + true, + false, 
+ false, + true); + public static UcdPropertyDetail Dash_Detail = + new UcdPropertyDetail( + UcdProperty.Dash, + VersionInfo.getInstance(2, 0, 0), + 30, + true, + false, + false, + true); + public static UcdPropertyDetail White_Space_Detail = + new UcdPropertyDetail( + UcdProperty.White_Space, + VersionInfo.getInstance(2, 0, 0), + 31, + true, + false, + false, + true); + public static UcdPropertyDetail Hyphen_Detail = + new UcdPropertyDetail( + UcdProperty.Hyphen, + VersionInfo.getInstance(2, 0, 0), + 32, + true, + false, + false, + true); + public static UcdPropertyDetail Quotation_Mark_Detail = + new UcdPropertyDetail( + UcdProperty.Quotation_Mark, + VersionInfo.getInstance(2, 0, 0), + 33, + true, + false, + false, + true); + public static UcdPropertyDetail Radical_Detail = + new UcdPropertyDetail( + UcdProperty.Radical, + VersionInfo.getInstance(3, 2, 0), + 34, + true, + false, + false, + true); + public static UcdPropertyDetail Ideographic_Detail = + new UcdPropertyDetail( + UcdProperty.Ideographic, + VersionInfo.getInstance(2, 0, 0), + 35, + true, + false, + false, + true); + public static UcdPropertyDetail Unified_Ideograph_Detail = + new UcdPropertyDetail( + UcdProperty.Unified_Ideograph, + VersionInfo.getInstance(3, 2, 0), + 36, + true, + false, + false, + true); + public static UcdPropertyDetail IDS_Binary_Operator_Detail = + new UcdPropertyDetail( + UcdProperty.IDS_Binary_Operator, + VersionInfo.getInstance(3, 2, 0), + 37, + true, + false, + false, + true); + public static UcdPropertyDetail IDS_Trinary_Operator_Detail = + new UcdPropertyDetail( + UcdProperty.IDS_Trinary_Operator, + VersionInfo.getInstance(3, 2, 0), + 38, + true, + false, + false, + true); + public static UcdPropertyDetail Hangul_Syllable_Type_Detail = + new UcdPropertyDetail( + UcdProperty.Hangul_Syllable_Type, + VersionInfo.getInstance(4, 0, 0), + 39, + true, + false, + false, + true); + public static UcdPropertyDetail Default_Ignorable_Code_Point_Detail = + new UcdPropertyDetail( + 
UcdProperty.Default_Ignorable_Code_Point, + VersionInfo.getInstance(3, 2, 0), + 40, + true, + false, + false, + true); + public static UcdPropertyDetail Other_Default_Ignorable_Code_Point_Detail = + new UcdPropertyDetail( + UcdProperty.Other_Default_Ignorable_Code_Point, + VersionInfo.getInstance(3, 2, 0), + 41, + true, + false, + false, + true); + public static UcdPropertyDetail Alphabetic_Detail = + new UcdPropertyDetail( + UcdProperty.Alphabetic, + VersionInfo.getInstance(1, 1, 0), + 42, + true, + false, + false, + true); + public static UcdPropertyDetail Other_Alphabetic_Detail = + new UcdPropertyDetail( + UcdProperty.Other_Alphabetic, + VersionInfo.getInstance(3, 1, 0), + 43, + true, + false, + false, + true); + public static UcdPropertyDetail Uppercase_Detail = + new UcdPropertyDetail( + UcdProperty.Uppercase, + VersionInfo.getInstance(3, 1, 0), + 44, + true, + false, + false, + true); + public static UcdPropertyDetail Other_Uppercase_Detail = + new UcdPropertyDetail( + UcdProperty.Other_Uppercase, + VersionInfo.getInstance(3, 1, 0), + 45, + true, + false, + false, + true); + public static UcdPropertyDetail Lowercase_Detail = + new UcdPropertyDetail( + UcdProperty.Lowercase, + VersionInfo.getInstance(3, 1, 0), + 46, + true, + false, + false, + true); + public static UcdPropertyDetail Other_Lowercase_Detail = + new UcdPropertyDetail( + UcdProperty.Other_Lowercase, + VersionInfo.getInstance(3, 1, 0), + 47, + true, + false, + false, + true); + public static UcdPropertyDetail Math_Detail = + new UcdPropertyDetail( + UcdProperty.Math, + VersionInfo.getInstance(2, 0, 0), + 48, + true, + false, + false, + true); + public static UcdPropertyDetail Other_Math_Detail = + new UcdPropertyDetail( + UcdProperty.Other_Math, + VersionInfo.getInstance(3, 1, 0), + 49, + true, + false, + false, + true); + public static UcdPropertyDetail Hex_Digit_Detail = + new UcdPropertyDetail( + UcdProperty.Hex_Digit, + VersionInfo.getInstance(2, 0, 0), + 50, + true, + false, + false, + 
true); + public static UcdPropertyDetail ASCII_Hex_Digit_Detail = + new UcdPropertyDetail( + UcdProperty.ASCII_Hex_Digit, + VersionInfo.getInstance(3, 1, 1), + 51, + true, + false, + false, + true); + public static UcdPropertyDetail Noncharacter_Code_Point_Detail = + new UcdPropertyDetail( + UcdProperty.Noncharacter_Code_Point, + VersionInfo.getInstance(3, 0, 1), + 52, + true, + false, + false, + true); + public static UcdPropertyDetail Variation_Selector_Detail = + new UcdPropertyDetail( + UcdProperty.Variation_Selector, + VersionInfo.getInstance(4, 0, 1), + 53, + true, + false, + false, + true); + public static UcdPropertyDetail Bidi_Control_Detail = + new UcdPropertyDetail( + UcdProperty.Bidi_Control, + VersionInfo.getInstance(2, 0, 0), + 54, + true, + false, + false, + true); + public static UcdPropertyDetail Join_Control_Detail = + new UcdPropertyDetail( + UcdProperty.Join_Control, + VersionInfo.getInstance(2, 0, 0), + 55, + true, + false, + false, + true); + public static UcdPropertyDetail Grapheme_Base_Detail = + new UcdPropertyDetail( + UcdProperty.Grapheme_Base, + VersionInfo.getInstance(3, 2, 0), + 56, + true, + false, + false, + true); + public static UcdPropertyDetail Grapheme_Extend_Detail = + new UcdPropertyDetail( + UcdProperty.Grapheme_Extend, + VersionInfo.getInstance(3, 2, 0), + 57, + true, + false, + false, + true); + public static UcdPropertyDetail Other_Grapheme_Extend_Detail = + new UcdPropertyDetail( + UcdProperty.Other_Grapheme_Extend, + VersionInfo.getInstance(3, 2, 0), + 58, + true, + false, + false, + true); + public static UcdPropertyDetail Grapheme_Link_Detail = + new UcdPropertyDetail( + UcdProperty.Grapheme_Link, + VersionInfo.getInstance(3, 2, 0), + 59, + true, + false, + false, + true); + public static UcdPropertyDetail Sentence_Terminal_Detail = + new UcdPropertyDetail( + UcdProperty.Sentence_Terminal, + VersionInfo.getInstance(9, 0, 0), + 60, + true, + false, + false, + true); + public static UcdPropertyDetail Extender_Detail = + 
new UcdPropertyDetail( + UcdProperty.Extender, + VersionInfo.getInstance(2, 0, 0), + 61, + true, + false, + false, + true); + public static UcdPropertyDetail Terminal_Punctuation_Detail = + new UcdPropertyDetail( + UcdProperty.Terminal_Punctuation, + VersionInfo.getInstance(2, 0, 0), + 62, + true, + false, + false, + true); + public static UcdPropertyDetail Diacritic_Detail = + new UcdPropertyDetail( + UcdProperty.Diacritic, + VersionInfo.getInstance(2, 0, 0), + 63, + true, + false, + false, + true); + public static UcdPropertyDetail Deprecated_Detail = + new UcdPropertyDetail( + UcdProperty.Deprecated, + VersionInfo.getInstance(3, 2, 0), + 64, + true, + false, + false, + true); + public static UcdPropertyDetail ID_Start_Detail = + new UcdPropertyDetail( + UcdProperty.ID_Start, + VersionInfo.getInstance(3, 1, 0), + 65, + true, + false, + false, + true); + public static UcdPropertyDetail Other_ID_Start_Detail = + new UcdPropertyDetail( + UcdProperty.Other_ID_Start, + VersionInfo.getInstance(4, 0, 0), + 66, + true, + false, + false, + true); + public static UcdPropertyDetail XID_Start_Detail = + new UcdPropertyDetail( + UcdProperty.XID_Start, + VersionInfo.getInstance(3, 1, 0), + 67, + true, + false, + false, + true); + public static UcdPropertyDetail ID_Continue_Detail = + new UcdPropertyDetail( + UcdProperty.ID_Continue, + VersionInfo.getInstance(3, 1, 0), + 68, + true, + false, + false, + true); + public static UcdPropertyDetail Other_ID_Continue_Detail = + new UcdPropertyDetail( + UcdProperty.Other_ID_Continue, + VersionInfo.getInstance(4, 1, 0), + 69, + true, + false, + false, + true); + public static UcdPropertyDetail XID_Continue_Detail = + new UcdPropertyDetail( + UcdProperty.XID_Continue, + VersionInfo.getInstance(3, 1, 0), + 70, + true, + false, + false, + true); + public static UcdPropertyDetail Soft_Dotted_Detail = + new UcdPropertyDetail( + UcdProperty.Soft_Dotted, + VersionInfo.getInstance(3, 2, 0), + 71, + true, + false, + false, + true); + public 
static UcdPropertyDetail Logical_Order_Exception_Detail = + new UcdPropertyDetail( + UcdProperty.Logical_Order_Exception, + VersionInfo.getInstance(3, 2, 0), + 72, + true, + false, + false, + true); + public static UcdPropertyDetail Pattern_White_Space_Detail = + new UcdPropertyDetail( + UcdProperty.Pattern_White_Space, + VersionInfo.getInstance(4, 1, 0), + 73, + true, + false, + false, + true); + public static UcdPropertyDetail Pattern_Syntax_Detail = + new UcdPropertyDetail( + UcdProperty.Pattern_Syntax, + VersionInfo.getInstance(4, 1, 0), + 74, + true, + false, + false, + true); + public static UcdPropertyDetail Grapheme_Cluster_Break_Detail = + new UcdPropertyDetail( + UcdProperty.Grapheme_Cluster_Break, + VersionInfo.getInstance(4, 1, 0), + 75, + true, + false, + false, + true); + public static UcdPropertyDetail Word_Break_Detail = + new UcdPropertyDetail( + UcdProperty.Word_Break, + VersionInfo.getInstance(4, 1, 0), + 76, + true, + false, + false, + true); + public static UcdPropertyDetail Sentence_Break_Detail = + new UcdPropertyDetail( + UcdProperty.Sentence_Break, + VersionInfo.getInstance(4, 1, 0), + 77, + true, + false, + false, + true); + public static UcdPropertyDetail Composition_Exclusion_Detail = + new UcdPropertyDetail( + UcdProperty.Composition_Exclusion, + VersionInfo.getInstance(3, 0, 0), + 78, + true, + false, + false, + true); + public static UcdPropertyDetail Full_Composition_Exclusion_Detail = + new UcdPropertyDetail( + UcdProperty.Full_Composition_Exclusion, + VersionInfo.getInstance(3, 1, 0), + 79, + true, + false, + false, + true); + public static UcdPropertyDetail NFC_Quick_Check_Detail = + new UcdPropertyDetail( + UcdProperty.NFC_Quick_Check, + VersionInfo.getInstance(3, 2, 0), + 80, + true, + false, + false, + true); + public static UcdPropertyDetail NFD_Quick_Check_Detail = + new UcdPropertyDetail( + UcdProperty.NFD_Quick_Check, + VersionInfo.getInstance(3, 2, 0), + 81, + true, + false, + false, + true); + public static 
UcdPropertyDetail NFKC_Quick_Check_Detail = + new UcdPropertyDetail( + UcdProperty.NFKC_Quick_Check, + VersionInfo.getInstance(5, 2, 0), + 82, + true, + false, + false, + true); + public static UcdPropertyDetail NFKD_Quick_Check_Detail = + new UcdPropertyDetail( + UcdProperty.NFKD_Quick_Check, + VersionInfo.getInstance(3, 2, 0), + 83, + true, + false, + false, + true); + public static UcdPropertyDetail Expands_On_NFC_Detail = + new UcdPropertyDetail( + UcdProperty.Expands_On_NFC, + VersionInfo.getInstance(3, 2, 0), + 84, + true, + false, + false, + true); + public static UcdPropertyDetail Expands_On_NFD_Detail = + new UcdPropertyDetail( + UcdProperty.Expands_On_NFD, + VersionInfo.getInstance(3, 2, 0), + 85, + true, + false, + false, + true); + public static UcdPropertyDetail Expands_On_NFKC_Detail = + new UcdPropertyDetail( + UcdProperty.Expands_On_NFKC, + VersionInfo.getInstance(3, 2, 0), + 86, + true, + false, + false, + true); + public static UcdPropertyDetail Expands_On_NFKD_Detail = + new UcdPropertyDetail( + UcdProperty.Expands_On_NFKD, + VersionInfo.getInstance(3, 2, 0), + 87, + true, + false, + false, + true); + public static UcdPropertyDetail FC_NFC_Closure_Detail = + new UcdPropertyDetail( + UcdProperty.FC_NFKC_Closure, + VersionInfo.getInstance(3, 1, 0), + 88, + true, + false, + false, + true); + public static UcdPropertyDetail Case_Ignorable_Detail = + new UcdPropertyDetail( + UcdProperty.Case_Ignorable, + VersionInfo.getInstance(5, 2, 0), + 89, + true, + false, + false, + true); + public static UcdPropertyDetail Cased_Detail = + new UcdPropertyDetail( + UcdProperty.Cased, + VersionInfo.getInstance(5, 2, 0), + 90, + true, + false, + false, + true); + public static UcdPropertyDetail Changes_When_CaseFolded_Detail = + new UcdPropertyDetail( + UcdProperty.Changes_When_Casefolded, + VersionInfo.getInstance(5, 2, 0), + 91, + true, + false, + false, + true); + public static UcdPropertyDetail Changes_When_CaseMapped_Detail = + new UcdPropertyDetail( + 
UcdProperty.Changes_When_Casemapped, + VersionInfo.getInstance(5, 2, 0), + 92, + true, + false, + false, + true); + public static UcdPropertyDetail Changes_When_NFKC_Casefolded_Detail = + new UcdPropertyDetail( + UcdProperty.Changes_When_NFKC_Casefolded, + VersionInfo.getInstance(5, 2, 0), + 93, + true, + false, + false, + true); + public static UcdPropertyDetail Changes_When_Lowercased_Detail = + new UcdPropertyDetail( + UcdProperty.Changes_When_Lowercased, + VersionInfo.getInstance(5, 2, 0), + 94, + true, + false, + false, + true); + public static UcdPropertyDetail Changes_When_Titlecased_Detail = + new UcdPropertyDetail( + UcdProperty.Changes_When_Titlecased, + VersionInfo.getInstance(5, 2, 0), + 95, + true, + false, + false, + true); + public static UcdPropertyDetail Changes_When_Uppercased_Detail = + new UcdPropertyDetail( + UcdProperty.Changes_When_Uppercased, + VersionInfo.getInstance(5, 2, 0), + 96, + true, + false, + false, + true); + public static UcdPropertyDetail NFKC_Casefold_Detail = + new UcdPropertyDetail( + UcdProperty.NFKC_Casefold, + VersionInfo.getInstance(5, 2, 0), + 97, + true, + false, + false, + true); + public static UcdPropertyDetail Indic_Syllabic_Category_Detail = + new UcdPropertyDetail( + UcdProperty.Indic_Syllabic_Category, + VersionInfo.getInstance(6, 1, 0), + 98, + true, + false, + false, + true); + // public static UcdPropertyDetail Indic_Matra_Category_Detail = new UcdPropertyDetail ( + // UcdProperty.Indic_Matra_Category, VersionInfo.getInstance(6,1,0), + // VersionInfo.getInstance(7,0,0), 99, + // true, false, false, true); + public static UcdPropertyDetail Indic_Positional_Category_Detail = + new UcdPropertyDetail( + UcdProperty.Indic_Positional_Category, + VersionInfo.getInstance(8, 0, 0), + 100, + true, + false, + false, + true); + public static UcdPropertyDetail kJa_Detail = + new UcdPropertyDetail( + UcdProperty.kJa, + VersionInfo.getInstance(8, 0, 0), + 101, + false, + true, + false, + true); + public static 
UcdPropertyDetail Prepended_Concatenation_Mark_Detail = + new UcdPropertyDetail( + UcdProperty.Prepended_Concatenation_Mark, + VersionInfo.getInstance(9, 0, 0), + 102, + true, + false, + false, + true); + public static UcdPropertyDetail Vertical_Orientation_Detail = + new UcdPropertyDetail( + UcdProperty.Vertical_Orientation, + VersionInfo.getInstance(10, 0, 0), + 103, + true, + false, + false, + true); + public static UcdPropertyDetail Regional_Indicator_Detail = + new UcdPropertyDetail( + UcdProperty.Regional_Indicator, + VersionInfo.getInstance(10, 0, 0), + 104, + true, + false, + false, + true); + public static UcdPropertyDetail Block_Detail = + new UcdPropertyDetail( + UcdProperty.Block, + VersionInfo.getInstance(2, 0, 0), + 105, + true, + false, + false, + true); + public static UcdPropertyDetail Equivalent_Unified_Ideograph_Detail = + new UcdPropertyDetail( + UcdProperty.Equivalent_Unified_Ideograph, + VersionInfo.getInstance(11, 0, 0), + 106, + false, + true, + false, + true); + public static UcdPropertyDetail kCompatibilityVariant_Detail = + new UcdPropertyDetail( + UcdProperty.kCompatibilityVariant, + VersionInfo.getInstance(3, 2, 0), + 107, + false, + true, + true, + true); + public static UcdPropertyDetail kRSUnicode_Detail = + new UcdPropertyDetail( + UcdProperty.kRSUnicode, + VersionInfo.getInstance(2, 0, 0), + 108, + false, + true, + false, + true); + // public static UcdPropertyDetail kIRG_RSIndex_Detail = new UcdPropertyDetail ( + // UcdProperty.kIRG_RSIndex, VersionInfo.getInstance(11,0,0), 109, + // false, true, false, true); + public static UcdPropertyDetail kIRG_GSource_Detail = + new UcdPropertyDetail( + UcdProperty.kIRG_GSource, + VersionInfo.getInstance(3, 0, 0), + 110, + false, + true, + true, + true); + public static UcdPropertyDetail kIRG_TSource_Detail = + new UcdPropertyDetail( + UcdProperty.kIRG_TSource, + VersionInfo.getInstance(3, 0, 0), + 111, + false, + true, + true, + true); + public static UcdPropertyDetail kIRG_JSource_Detail = 
+ new UcdPropertyDetail( + UcdProperty.kIRG_JSource, + VersionInfo.getInstance(3, 0, 0), + 112, + false, + true, + true, + true); + public static UcdPropertyDetail kIRG_KSource_Detail = + new UcdPropertyDetail( + UcdProperty.kIRG_KSource, + VersionInfo.getInstance(3, 0, 0), + 113, + false, + true, + true, + true); + public static UcdPropertyDetail kIRG_KPSource_Detail = + new UcdPropertyDetail( + UcdProperty.kIRG_KPSource, + VersionInfo.getInstance(3, 1, 1), + 114, + false, + true, + true, + true); + public static UcdPropertyDetail kIRG_VSource_Detail = + new UcdPropertyDetail( + UcdProperty.kIRG_VSource, + VersionInfo.getInstance(3, 0, 0), + 115, + false, + true, + true, + true); + public static UcdPropertyDetail kIRG_HSource_Detail = + new UcdPropertyDetail( + UcdProperty.kIRG_HSource, + VersionInfo.getInstance(3, 1, 0), + 116, + false, + true, + true, + true); + public static UcdPropertyDetail kIRG_USource_Detail = + new UcdPropertyDetail( + UcdProperty.kIRG_USource, + VersionInfo.getInstance(4, 0, 1), + 117, + false, + true, + true, + true); + public static UcdPropertyDetail kIRG_MSource_Detail = + new UcdPropertyDetail( + UcdProperty.kIRG_MSource, + VersionInfo.getInstance(5, 2, 0), + 118, + false, + true, + true, + true); + public static UcdPropertyDetail kIRG_UKSource_Detail = + new UcdPropertyDetail( + UcdProperty.kIRG_UKSource, + VersionInfo.getInstance(13, 0, 0), + 119, + false, + true, + true, + true); + public static UcdPropertyDetail kIRG_SSource_Detail = + new UcdPropertyDetail( + UcdProperty.kIRG_SSource, + VersionInfo.getInstance(13, 0, 0), + 120, + false, + true, + true, + true); + public static UcdPropertyDetail kIICore_Detail = + new UcdPropertyDetail( + UcdProperty.kIICore, + VersionInfo.getInstance(4, 1, 0), + 121, + false, + true, + false, + true); + public static UcdPropertyDetail kUnihanCore2020_Detail = + new UcdPropertyDetail( + UcdProperty.kUnihanCore2020, + VersionInfo.getInstance(13, 0, 0), + 122, + false, + true, + false, + true); + 
public static UcdPropertyDetail kGB0_Detail = + new UcdPropertyDetail( + UcdProperty.kGB0, + VersionInfo.getInstance(2, 0, 0), + 123, + false, + true, + false, + true); + public static UcdPropertyDetail kGB1_Detail = + new UcdPropertyDetail( + UcdProperty.kGB1, + VersionInfo.getInstance(2, 0, 0), + 124, + false, + true, + false, + true); + public static UcdPropertyDetail kGB3_Detail = + new UcdPropertyDetail( + UcdProperty.kGB3, + VersionInfo.getInstance(2, 0, 0), + 125, + false, + true, + false, + true); + public static UcdPropertyDetail kGB5_Detail = + new UcdPropertyDetail( + UcdProperty.kGB5, + VersionInfo.getInstance(2, 0, 0), + 126, + false, + true, + false, + true); + public static UcdPropertyDetail kGB7_Detail = + new UcdPropertyDetail( + UcdProperty.kGB7, + VersionInfo.getInstance(2, 0, 0), + 127, + false, + true, + false, + true); + public static UcdPropertyDetail kGB8_Detail = + new UcdPropertyDetail( + UcdProperty.kGB8, + VersionInfo.getInstance(2, 0, 0), + 128, + false, + true, + false, + true); + public static UcdPropertyDetail kCNS1986_Detail = + new UcdPropertyDetail( + UcdProperty.kCNS1986, + VersionInfo.getInstance(2, 0, 0), + 129, + false, + true, + false, + true); + public static UcdPropertyDetail kCNS1992_Detail = + new UcdPropertyDetail( + UcdProperty.kCNS1992, + VersionInfo.getInstance(2, 0, 0), + 130, + false, + true, + false, + true); + public static UcdPropertyDetail kJis0_Detail = + new UcdPropertyDetail( + UcdProperty.kJis0, + VersionInfo.getInstance(2, 0, 0), + 131, + false, + true, + false, + true); + public static UcdPropertyDetail kJis1_Detail = + new UcdPropertyDetail( + UcdProperty.kJis1, + VersionInfo.getInstance(2, 0, 0), + 132, + false, + true, + false, + true); + public static UcdPropertyDetail kJIS0213_Detail = + new UcdPropertyDetail( + UcdProperty.kJIS0213, + VersionInfo.getInstance(3, 1, 1), + 133, + false, + true, + false, + true); + public static UcdPropertyDetail kKSC0_Detail = + new UcdPropertyDetail( + 
UcdProperty.kKSC0, + VersionInfo.getInstance(2, 0, 0), + VersionInfo.getInstance(15, 1, 0), + 134, + false, + true, + false, + true); + public static UcdPropertyDetail kKSC1_Detail = + new UcdPropertyDetail( + UcdProperty.kKSC1, + VersionInfo.getInstance(2, 0, 0), + VersionInfo.getInstance(15, 1, 0), + 135, + false, + true, + false, + true); + public static UcdPropertyDetail kKPS0_Detail = + new UcdPropertyDetail( + UcdProperty.kKPS0, + VersionInfo.getInstance(3, 1, 1), + VersionInfo.getInstance(15, 1, 0), + 136, + false, + true, + false, + true); + public static UcdPropertyDetail kKPS1_Detail = + new UcdPropertyDetail( + UcdProperty.kKPS1, + VersionInfo.getInstance(3, 1, 1), + VersionInfo.getInstance(15, 1, 0), + 137, + false, + true, + false, + true); + public static UcdPropertyDetail kHKSCS_Detail = + new UcdPropertyDetail( + UcdProperty.kHKSCS, + VersionInfo.getInstance(3, 1, 1), + VersionInfo.getInstance(15, 1, 0), + 138, + false, + true, + false, + true); + public static UcdPropertyDetail kCantonese_Detail = + new UcdPropertyDetail( + UcdProperty.kCantonese, + VersionInfo.getInstance(2, 0, 0), + 139, + false, + true, + false, + true); + public static UcdPropertyDetail kHangul_Detail = + new UcdPropertyDetail( + UcdProperty.kHangul, + VersionInfo.getInstance(5, 0, 0), + 140, + false, + true, + false, + true); + public static UcdPropertyDetail kDefinition_Detail = + new UcdPropertyDetail( + UcdProperty.kDefinition, + VersionInfo.getInstance(2, 0, 0), + 141, + false, + true, + false, + true); + public static UcdPropertyDetail kHanYu_Detail = + new UcdPropertyDetail( + UcdProperty.kHanYu, + VersionInfo.getInstance(2, 0, 0), + 142, + false, + true, + false, + true); + // public static UcdPropertyDetail kAlternateHanYu_Detail = new UcdPropertyDetail ( + // UcdProperty.kAlternateHanYu, VersionInfo.getInstance(2,0,0), + // VersionInfo.getInstance(3,1,1), 143, + // false, true, false, true); + public static UcdPropertyDetail kMandarin_Detail = + new UcdPropertyDetail( 
+ UcdProperty.kMandarin, + VersionInfo.getInstance(2, 0, 0), + 144, + false, + true, + false, + true); + public static UcdPropertyDetail kCihaiT_Detail = + new UcdPropertyDetail( + UcdProperty.kCihaiT, + VersionInfo.getInstance(3, 2, 0), + 145, + false, + true, + false, + true); + public static UcdPropertyDetail kSBGY_Detail = + new UcdPropertyDetail( + UcdProperty.kSBGY, + VersionInfo.getInstance(3, 2, 0), + 146, + false, + true, + false, + true); + public static UcdPropertyDetail kNelson_Detail = + new UcdPropertyDetail( + UcdProperty.kNelson, + VersionInfo.getInstance(2, 0, 0), + 147, + false, + true, + false, + true); + public static UcdPropertyDetail kCowles_Detail = + new UcdPropertyDetail( + UcdProperty.kCowles, + VersionInfo.getInstance(3, 1, 1), + 148, + false, + true, + false, + true); + public static UcdPropertyDetail kMatthews_Detail = + new UcdPropertyDetail( + UcdProperty.kMatthews, + VersionInfo.getInstance(2, 0, 0), + 149, + false, + true, + false, + true); + public static UcdPropertyDetail kOtherNumeric_Detail = + new UcdPropertyDetail( + UcdProperty.kOtherNumeric, + VersionInfo.getInstance(3, 2, 0), + 150, + false, + true, + false, + true); + public static UcdPropertyDetail kPhonetic_Detail = + new UcdPropertyDetail( + UcdProperty.kPhonetic, + VersionInfo.getInstance(3, 1, 0), + 151, + false, + true, + false, + true); + public static UcdPropertyDetail kGSR_Detail = + new UcdPropertyDetail( + UcdProperty.kGSR, + VersionInfo.getInstance(4, 0, 1), + 152, + false, + true, + false, + true); + public static UcdPropertyDetail kFenn_Detail = + new UcdPropertyDetail( + UcdProperty.kFenn, + VersionInfo.getInstance(3, 1, 1), + 153, + false, + true, + false, + true); + public static UcdPropertyDetail kFennIndex_Detail = + new UcdPropertyDetail( + UcdProperty.kFennIndex, + VersionInfo.getInstance(4, 1, 0), + 154, + false, + true, + false, + true); + public static UcdPropertyDetail kKarlgren_Detail = + new UcdPropertyDetail( + UcdProperty.kKarlgren, + 
VersionInfo.getInstance(3, 1, 1), + 155, + false, + true, + false, + true); + public static UcdPropertyDetail kCangjie_Detail = + new UcdPropertyDetail( + UcdProperty.kCangjie, + VersionInfo.getInstance(3, 1, 1), + 156, + false, + true, + false, + true); + public static UcdPropertyDetail kMeyerWempe_Detail = + new UcdPropertyDetail( + UcdProperty.kMeyerWempe, + VersionInfo.getInstance(3, 1, 0), + 157, + false, + true, + false, + true); + public static UcdPropertyDetail kSimplifiedVariant_Detail = + new UcdPropertyDetail( + UcdProperty.kSimplifiedVariant, + VersionInfo.getInstance(2, 0, 0), + 158, + false, + true, + false, + true); + public static UcdPropertyDetail kTraditionalVariant_Detail = + new UcdPropertyDetail( + UcdProperty.kTraditionalVariant, + VersionInfo.getInstance(2, 0, 0), + 159, + false, + true, + false, + true); + public static UcdPropertyDetail kSpecializedSemanticVariant_Detail = + new UcdPropertyDetail( + UcdProperty.kSpecializedSemanticVariant, + VersionInfo.getInstance(2, 0, 0), + 160, + false, + true, + false, + true); + public static UcdPropertyDetail kSemanticVariant_Detail = + new UcdPropertyDetail( + UcdProperty.kSemanticVariant, + VersionInfo.getInstance(2, 0, 0), + 161, + false, + true, + false, + true); + public static UcdPropertyDetail kVietnamese_Detail = + new UcdPropertyDetail( + UcdProperty.kVietnamese, + VersionInfo.getInstance(3, 1, 1), + 162, + false, + true, + false, + true); + public static UcdPropertyDetail kLau_Detail = + new UcdPropertyDetail( + UcdProperty.kLau, + VersionInfo.getInstance(3, 1, 1), + 163, + false, + true, + false, + true); + public static UcdPropertyDetail kTang_Detail = + new UcdPropertyDetail( + UcdProperty.kTang, + VersionInfo.getInstance(2, 0, 0), + 164, + false, + true, + false, + true); + public static UcdPropertyDetail kZVariant_Detail = + new UcdPropertyDetail( + UcdProperty.kZVariant, + VersionInfo.getInstance(2, 0, 0), + 165, + false, + true, + false, + true); + public static UcdPropertyDetail 
kJapaneseKun_Detail = + new UcdPropertyDetail( + UcdProperty.kJapaneseKun, + VersionInfo.getInstance(2, 0, 0), + 166, + false, + true, + false, + true); + public static UcdPropertyDetail kJapaneseOn_Detail = + new UcdPropertyDetail( + UcdProperty.kJapaneseOn, + VersionInfo.getInstance(2, 0, 0), + 167, + false, + true, + false, + true); + public static UcdPropertyDetail kKangXi_Detail = + new UcdPropertyDetail( + UcdProperty.kKangXi, + VersionInfo.getInstance(2, 0, 0), + 168, + false, + true, + false, + true); + // public static UcdPropertyDetail kAlternateKangXi_Detail = new UcdPropertyDetail ( + // UcdProperty.kAlternateKangXi, VersionInfo.getInstance(2,0,0), + // VersionInfo.getInstance(4,0,1), 169, + // false, true, false, true); + public static UcdPropertyDetail kBigFive_Detail = + new UcdPropertyDetail( + UcdProperty.kBigFive, + VersionInfo.getInstance(2, 0, 0), + 170, + false, + true, + false, + true); + public static UcdPropertyDetail kCCCII_Detail = + new UcdPropertyDetail( + UcdProperty.kCCCII, + VersionInfo.getInstance(2, 0, 0), + 171, + false, + true, + false, + true); + public static UcdPropertyDetail kDaeJaweon_Detail = + new UcdPropertyDetail( + UcdProperty.kDaeJaweon, + VersionInfo.getInstance(2, 0, 0), + 172, + false, + true, + false, + true); + public static UcdPropertyDetail kEACC_Detail = + new UcdPropertyDetail( + UcdProperty.kEACC, + VersionInfo.getInstance(2, 0, 0), + 173, + false, + true, + false, + true); + public static UcdPropertyDetail kFrequency_Detail = + new UcdPropertyDetail( + UcdProperty.kFrequency, + VersionInfo.getInstance(3, 2, 0), + VersionInfo.getInstance(16, 0, 0), + 174, + false, + true, + false, + true); + public static UcdPropertyDetail kGradeLevel_Detail = + new UcdPropertyDetail( + UcdProperty.kGradeLevel, + VersionInfo.getInstance(3, 2, 0), + 175, + false, + true, + false, + true); + public static UcdPropertyDetail kHDZRadBreak_Detail = + new UcdPropertyDetail( + UcdProperty.kHDZRadBreak, + VersionInfo.getInstance(4, 1, 
0), + 176, + false, + true, + false, + true); + public static UcdPropertyDetail kHKGlyph_Detail = + new UcdPropertyDetail( + UcdProperty.kHKGlyph, + VersionInfo.getInstance(3, 1, 1), + 177, + false, + true, + false, + true); + public static UcdPropertyDetail kHanyuPinlu_Detail = + new UcdPropertyDetail( + UcdProperty.kHanyuPinlu, + VersionInfo.getInstance(4, 0, 1), + 178, + false, + true, + false, + true); + public static UcdPropertyDetail kHanyuPinyin_Detail = + new UcdPropertyDetail( + UcdProperty.kHanyuPinyin, + VersionInfo.getInstance(5, 2, 0), + 179, + false, + true, + false, + true); + public static UcdPropertyDetail kIRGHanyuDaZidian_Detail = + new UcdPropertyDetail( + UcdProperty.kIRGHanyuDaZidian, + VersionInfo.getInstance(3, 0, 0), + 180, + false, + true, + false, + true); + public static UcdPropertyDetail kIRGKangXi_Detail = + new UcdPropertyDetail( + UcdProperty.kIRGKangXi, + VersionInfo.getInstance(3, 0, 0), + 181, + false, + true, + false, + true); + public static UcdPropertyDetail kIRGDaeJaweon_Detail = + new UcdPropertyDetail( + UcdProperty.kIRGDaeJaweon, + VersionInfo.getInstance(3, 0, 0), + 182, + false, + true, + false, + true); + public static UcdPropertyDetail kIRGDaiKanwaZiten_Detail = + new UcdPropertyDetail( + UcdProperty.kIRGDaiKanwaZiten, + VersionInfo.getInstance(3, 0, 0), + VersionInfo.getInstance(15, 1, 0), + 183, + false, + true, + false, + true); + public static UcdPropertyDetail kKorean_Detail = + new UcdPropertyDetail( + UcdProperty.kKorean, + VersionInfo.getInstance(2, 0, 0), + 184, + false, + true, + false, + true); + public static UcdPropertyDetail kMainlandTelegraph_Detail = + new UcdPropertyDetail( + UcdProperty.kMainlandTelegraph, + VersionInfo.getInstance(2, 0, 0), + 185, + false, + true, + false, + true); + public static UcdPropertyDetail kMorohashi_Detail = + new UcdPropertyDetail( + UcdProperty.kMorohashi, + VersionInfo.getInstance(2, 0, 0), + 186, + false, + true, + false, + true); + // public static UcdPropertyDetail 
kAlternateMorohashi_Detail = new UcdPropertyDetail ( + // UcdProperty.kAlternateMorohashi, VersionInfo.getInstance(2,0,0), + // VersionInfo.getInstance(4,0,1), 187, + // false, true, false, true); + public static UcdPropertyDetail kPrimaryNumeric_Detail = + new UcdPropertyDetail( + UcdProperty.kPrimaryNumeric, + VersionInfo.getInstance(3, 2, 0), + 188, + false, + true, + false, + true); + public static UcdPropertyDetail kTaiwanTelegraph_Detail = + new UcdPropertyDetail( + UcdProperty.kTaiwanTelegraph, + VersionInfo.getInstance(2, 0, 0), + 189, + false, + true, + false, + true); + public static UcdPropertyDetail kXerox_Detail = + new UcdPropertyDetail( + UcdProperty.kXerox, + VersionInfo.getInstance(2, 0, 0), + 190, + false, + true, + false, + true); + public static UcdPropertyDetail kPseudoGB1_Detail = + new UcdPropertyDetail( + UcdProperty.kPseudoGB1, + VersionInfo.getInstance(2, 0, 0), + 191, + false, + true, + false, + true); + public static UcdPropertyDetail kIBMJapan_Detail = + new UcdPropertyDetail( + UcdProperty.kIBMJapan, + VersionInfo.getInstance(2, 0, 0), + 192, + false, + true, + false, + true); + public static UcdPropertyDetail kAccountingNumeric_Detail = + new UcdPropertyDetail( + UcdProperty.kAccountingNumeric, + VersionInfo.getInstance(3, 2, 0), + 193, + false, + true, + false, + true); + public static UcdPropertyDetail kCheungBauer_Detail = + new UcdPropertyDetail( + UcdProperty.kCheungBauer, + VersionInfo.getInstance(5, 0, 0), + 194, + false, + true, + false, + true); + public static UcdPropertyDetail kCheungBauerIndex_Detail = + new UcdPropertyDetail( + UcdProperty.kCheungBauerIndex, + VersionInfo.getInstance(5, 0, 0), + 195, + false, + true, + false, + true); + public static UcdPropertyDetail kFourCornerCode_Detail = + new UcdPropertyDetail( + UcdProperty.kFourCornerCode, + VersionInfo.getInstance(5, 0, 0), + 196, + false, + true, + false, + true); + // public static UcdPropertyDetail kWubi_Detail = new UcdPropertyDetail ( + // UcdProperty.kWubi, 
VersionInfo.getInstance(11,0,0), 197, + // false, true, false, true); + public static UcdPropertyDetail kXHC1983_Detail = + new UcdPropertyDetail( + UcdProperty.kXHC1983, + VersionInfo.getInstance(5, 1, 0), + 198, + false, + true, + false, + true); + public static UcdPropertyDetail kJinmeiyoKanji_Detail = + new UcdPropertyDetail( + UcdProperty.kJinmeiyoKanji, + VersionInfo.getInstance(11, 0, 0), + 199, + false, + true, + false, + true); + public static UcdPropertyDetail kJoyoKanji_Detail = + new UcdPropertyDetail( + UcdProperty.kJoyoKanji, + VersionInfo.getInstance(11, 0, 0), + 200, + false, + true, + false, + true); + public static UcdPropertyDetail kKoreanEducationHanja_Detail = + new UcdPropertyDetail( + UcdProperty.kKoreanEducationHanja, + VersionInfo.getInstance(11, 0, 0), + 201, + false, + true, + false, + true); + public static UcdPropertyDetail kKoreanName_Detail = + new UcdPropertyDetail( + UcdProperty.kKoreanName, + VersionInfo.getInstance(11, 0, 0), + 202, + false, + true, + false, + true); + public static UcdPropertyDetail kTGH_Detail = + new UcdPropertyDetail( + UcdProperty.kTGH, + VersionInfo.getInstance(11, 0, 0), + 203, + false, + true, + false, + true); + public static UcdPropertyDetail kTGHZ2013_Detail = + new UcdPropertyDetail( + UcdProperty.kTGHZ2013, + VersionInfo.getInstance(13, 0, 0), + 204, + false, + true, + false, + true); + public static UcdPropertyDetail kSpoofingVariant_Detail = + new UcdPropertyDetail( + UcdProperty.kSpoofingVariant, + VersionInfo.getInstance(13, 0, 0), + 205, + false, + true, + false, + true); + public static UcdPropertyDetail kRSKanWa_Detail = + new UcdPropertyDetail( + UcdProperty.kRSKanWa, + VersionInfo.getInstance(2, 0, 0), + 206, + false, + true, + false, + true); + public static UcdPropertyDetail kRSJapanese_Detail = + new UcdPropertyDetail( + UcdProperty.kRSJapanese, + VersionInfo.getInstance(2, 0, 0), + 207, + false, + true, + false, + true); + public static UcdPropertyDetail kRSKorean_Detail = + new 
UcdPropertyDetail( + UcdProperty.kRSKorean, + VersionInfo.getInstance(2, 0, 0), + 208, + false, + true, + false, + true); + public static UcdPropertyDetail kRSKangXi_Detail = + new UcdPropertyDetail( + UcdProperty.kRSKangXi, + VersionInfo.getInstance(2, 0, 0), + VersionInfo.getInstance(15, 1, 0), + 209, + false, + true, + false, + true); + public static UcdPropertyDetail kRSAdobe_Japan1_6_Detail = + new UcdPropertyDetail( + UcdProperty.kRSAdobe_Japan1_6, + VersionInfo.getInstance(4, 1, 0), + 210, + false, + true, + false, + true); + public static UcdPropertyDetail kTotalStrokes_Detail = + new UcdPropertyDetail( + UcdProperty.kTotalStrokes, + VersionInfo.getInstance(3, 1, 0), + 211, + false, + true, + false, + true); + public static UcdPropertyDetail kRSTUnicode_Detail = + new UcdPropertyDetail( + UcdProperty.kRSTUnicode, + VersionInfo.getInstance(9, 0, 0), + 212, + false, + true, + false, + true); + public static UcdPropertyDetail kTGT_MergedSrc_Detail = + new UcdPropertyDetail( + UcdProperty.kTGT_MergedSrc, + VersionInfo.getInstance(9, 0, 0), + 213, + false, + true, + false, + true); + public static UcdPropertyDetail kSrc_NushuDuben_Detail = + new UcdPropertyDetail( + UcdProperty.kSrc_NushuDuben, + VersionInfo.getInstance(10, 0, 0), + 214, + false, + true, + false, + true); + public static UcdPropertyDetail kReading_Detail = + new UcdPropertyDetail( + UcdProperty.kReading, + VersionInfo.getInstance(10, 0, 0), + 215, + false, + true, + false, + true); + public static UcdPropertyDetail ISO_Comment_Detail = + new UcdPropertyDetail( + UcdProperty.ISO_Comment, + VersionInfo.getInstance(11, 0, 0), + 216, + true, + false, + false, + true); + public static UcdPropertyDetail Unicode_1_Name_Detail = + new UcdPropertyDetail( + UcdProperty.Unicode_1_Name, + VersionInfo.getInstance(2, 0, 0), + 217, + true, + false, + false, + true); + public static UcdPropertyDetail Name_Alias_Detail = + new UcdPropertyDetail( + UcdProperty.Name_Alias, + VersionInfo.getInstance(5, 0, 0), + 
218, + false, + false, + false, + true); + public static UcdPropertyDetail Emoji_Detail = + new UcdPropertyDetail( + UcdProperty.Emoji, + VersionInfo.getInstance(13, 0, 0), + 219, + true, + false, + false, + true); + public static UcdPropertyDetail Emoji_Presentation_Detail = + new UcdPropertyDetail( + UcdProperty.Emoji_Presentation, + VersionInfo.getInstance(13, 0, 0), + 220, + true, + false, + false, + true); + public static UcdPropertyDetail Emoji_Modifier_Detail = + new UcdPropertyDetail( + UcdProperty.Emoji_Modifier, + VersionInfo.getInstance(13, 0, 0), + 221, + true, + false, + false, + true); + public static UcdPropertyDetail Emoji_Modifier_Base_Detail = + new UcdPropertyDetail( + UcdProperty.Emoji_Modifier_Base, + VersionInfo.getInstance(13, 0, 0), + 222, + true, + false, + false, + true); + public static UcdPropertyDetail Emoji_Component_Detail = + new UcdPropertyDetail( + UcdProperty.Emoji_Component, + VersionInfo.getInstance(13, 0, 0), + 223, + true, + false, + false, + true); + public static UcdPropertyDetail Extended_Pictographic_Detail = + new UcdPropertyDetail( + UcdProperty.Extended_Pictographic, + VersionInfo.getInstance(13, 0, 0), + 224, + true, + false, + false, + true); + public static UcdPropertyDetail kStrange_Detail = + new UcdPropertyDetail( + UcdProperty.kStrange, + VersionInfo.getInstance(14, 0, 0), + 225, + false, + true, + false, + true); + public static UcdPropertyDetail kAlternateTotalStrokes_Detail = + new UcdPropertyDetail( + UcdProperty.kAlternateTotalStrokes, + VersionInfo.getInstance(15, 0, 0), + 226, + false, + true, + false, + true); + public static UcdPropertyDetail NFKC_Simple_Casefold_Detail = + new UcdPropertyDetail( + UcdProperty.NFKC_Simple_Casefold, + VersionInfo.getInstance(15, 1, 0), + 227, + true, + false, + false, + true); + public static UcdPropertyDetail ID_Compat_Math_Start_Detail = + new UcdPropertyDetail( + UcdProperty.ID_Compat_Math_Start, + VersionInfo.getInstance(15, 1, 0), + 228, + true, + false, + false, + 
true); + public static UcdPropertyDetail ID_Compat_Math_Continue_Detail = + new UcdPropertyDetail( + UcdProperty.ID_Compat_Math_Continue, + VersionInfo.getInstance(15, 1, 0), + 229, + true, + false, + false, + true); + public static UcdPropertyDetail IDS_Unary_Operator_Detail = + new UcdPropertyDetail( + UcdProperty.IDS_Unary_Operator, + VersionInfo.getInstance(15, 1, 0), + 230, + true, + false, + false, + true); + public static UcdPropertyDetail kJapanese_Detail = + new UcdPropertyDetail( + UcdProperty.kJapanese, + VersionInfo.getInstance(15, 1, 0), + 231, + false, + true, + false, + true); + public static UcdPropertyDetail kMojiJoho_Detail = + new UcdPropertyDetail( + UcdProperty.kMojiJoho, + VersionInfo.getInstance(15, 1, 0), + 232, + false, + true, + false, + true); + public static UcdPropertyDetail kSMSZD2003Index_Detail = + new UcdPropertyDetail( + UcdProperty.kSMSZD2003Index, + VersionInfo.getInstance(15, 1, 0), + 233, + false, + true, + false, + true); + public static UcdPropertyDetail kSMSZD2003Readings_Detail = + new UcdPropertyDetail( + UcdProperty.kSMSZD2003Readings, + VersionInfo.getInstance(15, 1, 0), + 234, + false, + true, + false, + true); + public static UcdPropertyDetail kVietnameseNumeric_Detail = + new UcdPropertyDetail( + UcdProperty.kVietnameseNumeric, + VersionInfo.getInstance(15, 1, 0), + 235, + false, + true, + false, + true); + public static UcdPropertyDetail kZhuangNumeric_Detail = + new UcdPropertyDetail( + UcdProperty.kZhuangNumeric, + VersionInfo.getInstance(15, 1, 0), + 236, + false, + true, + false, + true); + public static UcdPropertyDetail Indic_Conjunct_Break_Detail = + new UcdPropertyDetail( + UcdProperty.Indic_Conjunct_Break, + VersionInfo.getInstance(15, 1, 0), + 237, + true, + false, + false, + true); + public static UcdPropertyDetail Modifier_Combining_Mark_Detail = + new UcdPropertyDetail( + UcdProperty.Modifier_Combining_Mark, + VersionInfo.getInstance(16, 0, 0), + 238, + true, + false, + false, + true); + public static 
UcdPropertyDetail kFanqie_Detail = + new UcdPropertyDetail( + UcdProperty.kFanqie, + VersionInfo.getInstance(16, 0, 0), + 239, + false, + true, + false, + true); + public static UcdPropertyDetail kZhuang_Detail = + new UcdPropertyDetail( + UcdProperty.kZhuang, + VersionInfo.getInstance(16, 0, 0), + 240, + false, + true, + false, + true); + public static UcdPropertyDetail Basic_Emoji_Detail = + new UcdPropertyDetail(UcdProperty.Basic_Emoji, -1, false, false, false, false); + public static UcdPropertyDetail CJK_Radical_Detail = + new UcdPropertyDetail(UcdProperty.CJK_Radical, -2, false, false, false, false); + public static UcdPropertyDetail Confusable_MA_Detail = + new UcdPropertyDetail(UcdProperty.Confusable_MA, -3, false, false, false, false); + public static UcdPropertyDetail Confusable_ML_Detail = + new UcdPropertyDetail(UcdProperty.Confusable_ML, -4, false, false, false, false); + public static UcdPropertyDetail Confusable_SA_Detail = + new UcdPropertyDetail(UcdProperty.Confusable_SA, -5, false, false, false, false); + public static UcdPropertyDetail Confusable_SL_Detail = + new UcdPropertyDetail(UcdProperty.Confusable_SL, -6, false, false, false, false); + public static UcdPropertyDetail Do_Not_Emit_Preferred_Detail = + new UcdPropertyDetail( + UcdProperty.Do_Not_Emit_Preferred, -7, false, false, false, false); + public static UcdPropertyDetail Do_Not_Emit_Type_Detail = + new UcdPropertyDetail(UcdProperty.Do_Not_Emit_Type, -8, false, false, false, false); + public static UcdPropertyDetail Emoji_DCM_Detail = + new UcdPropertyDetail( + UcdProperty.Emoji_DCM, + VersionInfo.getInstance(6, 0, 0), + -9, + false, + false, + false, + false); + public static UcdPropertyDetail Emoji_KDDI_Detail = + new UcdPropertyDetail( + UcdProperty.Emoji_KDDI, + VersionInfo.getInstance(6, 0, 0), + -10, + false, + false, + false, + false); + public static UcdPropertyDetail Emoji_SB_Detail = + new UcdPropertyDetail( + UcdProperty.Emoji_SB, + VersionInfo.getInstance(6, 0, 0), + -11, + 
false, + false, + false, + false); + public static UcdPropertyDetail Identifier_Status_Detail = + new UcdPropertyDetail( + UcdProperty.Identifier_Status, + VersionInfo.getInstance(9, 0, 0), + -12, + false, + false, + false, + false); + public static UcdPropertyDetail Identifier_Type_Detail = + new UcdPropertyDetail( + UcdProperty.Identifier_Type, + VersionInfo.getInstance(9, 0, 0), + -13, + false, + false, + false, + false); + public static UcdPropertyDetail Idn_2008_Detail = + new UcdPropertyDetail(UcdProperty.Idn_2008, -14, false, false, false, false); + public static UcdPropertyDetail Idn_Mapping_Detail = + new UcdPropertyDetail(UcdProperty.Idn_Mapping, -15, false, false, false, false); + public static UcdPropertyDetail Idn_Status_Detail = + new UcdPropertyDetail(UcdProperty.Idn_Status, -16, false, false, false, false); + public static UcdPropertyDetail Named_Sequences_Detail = + new UcdPropertyDetail(UcdProperty.Named_Sequences, -17, false, false, false, false); + public static UcdPropertyDetail Named_Sequences_Prov_Detail = + new UcdPropertyDetail( + UcdProperty.Named_Sequences_Prov, -18, false, false, false, false); + public static UcdPropertyDetail Other_Joining_Type_Detail = + new UcdPropertyDetail(UcdProperty.Other_Joining_Type, -19, false, false, false, false); + public static UcdPropertyDetail RGI_Emoji_Flag_Sequence_Detail = + new UcdPropertyDetail( + UcdProperty.RGI_Emoji_Flag_Sequence, -20, false, false, false, false); + public static UcdPropertyDetail RGI_Emoji_Keycap_Sequence_Detail = + new UcdPropertyDetail( + UcdProperty.RGI_Emoji_Keycap_Sequence, -21, false, false, false, false); + public static UcdPropertyDetail RGI_Emoji_Modifier_Sequence_Detail = + new UcdPropertyDetail( + UcdProperty.RGI_Emoji_Modifier_Sequence, -22, false, false, false, false); + public static UcdPropertyDetail RGI_Emoji_Tag_Sequence_Detail = + new UcdPropertyDetail( + UcdProperty.RGI_Emoji_Tag_Sequence, -23, false, false, false, false); + public static UcdPropertyDetail 
RGI_Emoji_Zwj_Sequence_Detail = + new UcdPropertyDetail( + UcdProperty.RGI_Emoji_Zwj_Sequence, -24, false, false, false, false); + public static UcdPropertyDetail Standardized_Variant_Detail = + new UcdPropertyDetail( + UcdProperty.Standardized_Variant, -25, false, false, false, false); + /* Per-instance state; every field is assigned by the eight-argument constructor below. */ + private UcdProperty ucdProperty; + private VersionInfo minVersion; + private VersionInfo maxVersion; + private int sortOrder; + private boolean isBaseAttribute; + private boolean isCJKAttribute; + private boolean isCJKShowIfEmpty; + private boolean isOrgUCDXMLAttribute; + /* Overload without maxVersion: forwards to the eight-argument constructor with maxVersion = null. */ + private UcdPropertyDetail( + UcdProperty ucdProperty, + VersionInfo minVersion, + int sortOrder, + boolean isBaseAttribute, + boolean isCJKAttribute, + boolean isCJKShowIfEmpty, + boolean isOrgUCDXMLAttribute) { + this( + ucdProperty, + minVersion, + null, + sortOrder, + isBaseAttribute, + isCJKAttribute, + isCJKShowIfEmpty, + isOrgUCDXMLAttribute); + } + /* Overload without version bounds: forwards to the eight-argument constructor with minVersion = null and maxVersion = null. */ + private UcdPropertyDetail( + UcdProperty ucdProperty, + int sortOrder, + boolean isBaseAttribute, + boolean isCJKAttribute, + boolean isCJKShowIfEmpty, + boolean isOrgUCDXMLAttribute) { + this( + ucdProperty, + null, + null, + sortOrder, + isBaseAttribute, + isCJKAttribute, + isCJKShowIfEmpty, + isOrgUCDXMLAttribute); + } + /* Canonical constructor: stores all eight arguments unchanged, then registers this instance in the static collections (declared outside this part of the file). */ + private UcdPropertyDetail( + UcdProperty ucdProperty, + VersionInfo minVersion, + VersionInfo maxVersion, + int sortOrder, + boolean isBaseAttribute, + boolean isCJKAttribute, + boolean isCJKShowIfEmpty, + boolean isOrgUCDXMLAttribute) { + this.ucdProperty = ucdProperty; + this.minVersion = minVersion; + this.maxVersion = maxVersion; + this.sortOrder = sortOrder; + this.isBaseAttribute = isBaseAttribute; + this.isCJKAttribute = isCJKAttribute; + this.isCJKShowIfEmpty = isCJKShowIfEmpty; + this.isOrgUCDXMLAttribute = isOrgUCDXMLAttribute; + /* Registration: always in allPropertyDetails; base attributes also go to basePropertyDetails and ucdxmlPropertyDetails; CJK attributes also go to cjkPropertyDetails and ucdxmlPropertyDetails. */ + allPropertyDetails.add(this); + if (isBaseAttribute) { + basePropertyDetails.add(this); + ucdxmlPropertyDetails.add(this); + } + if (isCJKAttribute) { + cjkPropertyDetails.add(this); + 
ucdxmlPropertyDetails.add(this); + } + } + /* Static accessors: each returns the corresponding registry object as-is (no copy is made here). */ + public static Set values() { + return allPropertyDetails; + } + + public static Set baseValues() { + return basePropertyDetails; + } + + public static Set cjkValues() { + return cjkPropertyDetails; + } + + public static Set ucdxmlValues() { + return ucdxmlPropertyDetails; + } + /* Instance accessors: return the values handed to the constructor. */ + public UcdProperty getUcdProperty() { + return this.ucdProperty; + } + /* May return null: the six-argument constructor passes null for minVersion. */ + public VersionInfo getMinVersion() { + return this.minVersion; + } + /* May return null: the six- and seven-argument constructors pass null for maxVersion. */ + public VersionInfo getMaxVersion() { + return this.maxVersion; + } + + public boolean isBaseAttribute() { + return this.isBaseAttribute; + } + + public boolean isCJKAttribute() { + return this.isCJKAttribute; + } + + public boolean isCJKShowIfEmpty() { + return this.isCJKShowIfEmpty; + } + + public boolean isOrgUCDXMLAttribute() { + return this.isOrgUCDXMLAttribute; + } +} diff --git a/unicodetools/src/main/java/org/unicode/xml/UcdSectionComponent.java b/unicodetools/src/main/java/org/unicode/xml/UcdSectionComponent.java new file mode 100644 index 0000000000..0773486ccf --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/UcdSectionComponent.java @@ -0,0 +1,28 @@ +package org.unicode.xml; + +import com.ibm.icu.util.VersionInfo; +import org.unicode.props.UcdProperty; + /* Immutable holder for a (minVersion, maxVersion, ucdProperty) triple. The package-private constructor stores its arguments unchanged and performs no null checks. */ +public class UcdSectionComponent { + private final VersionInfo minVersion; + private final VersionInfo maxVersion; + private final UcdProperty ucdProperty; + + UcdSectionComponent(VersionInfo minVersion, VersionInfo maxVersion, UcdProperty ucdProperty) { + this.minVersion = minVersion; + this.maxVersion = maxVersion; + this.ucdProperty = ucdProperty; + } + + public VersionInfo getMinVersion() { + return this.minVersion; + } + /* NOTE(review): a null maxVersion presumably means "no upper bound" - confirm against the code that consumes it. */ + public VersionInfo getMaxVersion() { + return this.maxVersion; + } + + public UcdProperty getUcdProperty() { + return this.ucdProperty; + } +} diff --git a/unicodetools/src/main/java/org/unicode/xml/UcdSectionDetail.java b/unicodetools/src/main/java/org/unicode/xml/UcdSectionDetail.java new file mode 
100644 index 0000000000..ceed693afd --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/UcdSectionDetail.java @@ -0,0 +1,224 @@ +package org.unicode.xml; + +import com.ibm.icu.util.VersionInfo; +import org.unicode.props.UcdProperty; + /* Pairs a UcdSection with its UcdSectionComponent array and a sort order. One public static instance exists per section; the nested enum links back to it lazily. */ +public class UcdSectionDetail { + /* One constant per section: tag (returned by toString()), child tag, min/max version (max may be null), a deferred link to the matching *_Detail static, and two parser flags. */ + public enum UcdSection { + BLOCKS( + "blocks", + "block", + VersionInfo.getInstance(1, 1, 0), + null, + () -> Blocks_Detail, + true, + true), + CJKRADICALS( + "cjk-radicals", + "cjk-radical", + VersionInfo.getInstance(1, 1, 0), + null, + () -> CJKRadicals_Detail, + false, + false), + DONOTEMIT( + "do-not-emit", + "instead", + VersionInfo.getInstance(16, 0, 0), + null, + () -> DoNotEmit_Detail, + false, + false), + EMOJISOURCES( + "emoji-sources", + "emoji-source", + VersionInfo.getInstance(1, 1, 0), + null, + () -> EmojiSources_Detail, + true, + false), + NAMEDSEQUENCES( + "named-sequences", + "named-sequence", + VersionInfo.getInstance(1, 1, 0), + null, + () -> NamedSequences_Detail, + false, + false), + PROVISIONALNAMEDSEQUENCES( + "provisional-named-sequences", + "named-sequence", + VersionInfo.getInstance(5, 0, 0), + VersionInfo.getInstance(13, 0, 0), + () -> ProvisionalNamedSequences_Detail, + false, + false), + NORMALIZATIONCORRECTIONS( + "normalization-corrections", + "normalization-correction", + VersionInfo.getInstance(1, 1, 0), + null, + () -> NormalizationCorrections_Detail, + true, + false), + STANDARDIZEDVARIANTS( + "standardized-variants", + "standardized-variant", + VersionInfo.getInstance(1, 1, 0), + null, + () -> StandardizedVariants_Detail, + true, + false); + private final String tag; + private final String childTag; + private final VersionInfo minVersion; + private final VersionInfo maxVersion; + private final java.util.function.Supplier<UcdSectionDetail> ucdSectionDetail; + private final boolean parserWithRange; + private final boolean parserWithMissing; + /* The detail is taken as a Supplier because the enum constants and the *_Detail statics refer to each other: an eager read here would capture null whenever UcdSectionDetail is initialized first, and would leave every detail with a null section when the enum is initialized first. */ + UcdSection( + String tag, + String childTag, + VersionInfo minVersion, + VersionInfo maxVersion, + java.util.function.Supplier<UcdSectionDetail> ucdSectionDetail, + boolean parserWithRange, + boolean 
parserWithMissing) { + this.tag = tag; + this.childTag = childTag; + this.minVersion = minVersion; + this.maxVersion = maxVersion; + this.ucdSectionDetail = ucdSectionDetail; + this.parserWithRange = parserWithRange; + this.parserWithMissing = parserWithMissing; + } + /* Returns the section tag, e.g. "blocks". */ + public String toString() { + return tag; + } + + public String getChildTag() { + return childTag; + } + + public VersionInfo getMinVersion() { + return minVersion; + } + /* May return null: most constants pass null for maxVersion. */ + public VersionInfo getMaxVersion() { + return maxVersion; + } + /* Resolves the link on each call, after both classes have finished static initialization, so the result is the populated *_Detail instance regardless of which class was loaded first. */ + public UcdSectionDetail getUcdSectionDetail() { + return ucdSectionDetail.get(); + } + + public boolean getParserWithRange() { + return parserWithRange; + } + + public boolean getParserWithMissing() { + return parserWithMissing; + } + } + /* Section details: each names its UcdSection, the component properties with their own version bounds, and a sort order. */ + public static UcdSectionDetail Blocks_Detail = + new UcdSectionDetail( + UcdSection.BLOCKS, + new UcdSectionComponent[] { + new UcdSectionComponent( + VersionInfo.getInstance(1, 1, 0), null, UcdProperty.Block) + }, + 0); + public static UcdSectionDetail NamedSequences_Detail = + new UcdSectionDetail( + UcdSection.NAMEDSEQUENCES, + new UcdSectionComponent[] { + new UcdSectionComponent( + VersionInfo.getInstance(1, 1, 0), null, UcdProperty.Named_Sequences) + }, + 1); + /* NOTE(review): sort order 1 is shared with NamedSequences_Detail - confirm the tie is intended. */ public static UcdSectionDetail ProvisionalNamedSequences_Detail = + new UcdSectionDetail( + UcdSection.PROVISIONALNAMEDSEQUENCES, + new UcdSectionComponent[] { + new UcdSectionComponent( + VersionInfo.getInstance(5, 0, 0), + VersionInfo.getInstance(13, 0, 0), + UcdProperty.Named_Sequences_Prov) + }, + 1); + public static UcdSectionDetail NormalizationCorrections_Detail = + new UcdSectionDetail( + UcdSection.NORMALIZATIONCORRECTIONS, + new UcdSectionComponent[] { + new UcdSectionComponent( + VersionInfo.getInstance(1, 1, 0), null, UcdProperty.NC_Original) + }, + 2); + public static UcdSectionDetail StandardizedVariants_Detail = + new UcdSectionDetail( + UcdSection.STANDARDIZEDVARIANTS, + new UcdSectionComponent[] { + new UcdSectionComponent( + VersionInfo.getInstance(1, 1, 
0), + null, + UcdProperty.Standardized_Variant), + new UcdSectionComponent( + VersionInfo.getInstance(13, 0, 0), + null, + UcdProperty.emoji_variation_sequence) + }, + 3); + public static UcdSectionDetail CJKRadicals_Detail = + new UcdSectionDetail( + UcdSection.CJKRADICALS, + new UcdSectionComponent[] { + new UcdSectionComponent( + VersionInfo.getInstance(1, 1, 0), null, UcdProperty.CJK_Radical) + }, + 4); + public static UcdSectionDetail EmojiSources_Detail = + new UcdSectionDetail( + UcdSection.EMOJISOURCES, + new UcdSectionComponent[] { + new UcdSectionComponent( + VersionInfo.getInstance(1, 1, 0), null, UcdProperty.Emoji_DCM) + }, + 5); + /* NOTE(review): the DONOTEMIT enum constant declares minVersion 16.0.0 while this component declares 1.1.0 - confirm which bound is meant to apply. */ public static UcdSectionDetail DoNotEmit_Detail = + new UcdSectionDetail( + UcdSection.DONOTEMIT, + new UcdSectionComponent[] { + new UcdSectionComponent( + VersionInfo.getInstance(1, 1, 0), + null, + UcdProperty.Do_Not_Emit_Type) + }, + 6); + + private final UcdSection ucdSection; + private final UcdSectionComponent[] ucdSectionComponents; + private final int sortOrder; + /* Stores the three arguments unchanged; the component array is kept by reference, not copied. */ + private UcdSectionDetail( + UcdSection ucdSection, UcdSectionComponent[] ucdSectionComponents, int sortOrder) { + this.ucdSection = ucdSection; + this.ucdSectionComponents = ucdSectionComponents; + this.sortOrder = sortOrder; + } + + public UcdSection getSection() { + return this.ucdSection; + } + /* Returns the internal array itself; callers that modify it change this detail. */ + public UcdSectionComponent[] getUcdSectionComponents() { + return this.ucdSectionComponents; + } + + public int getSortOrder() { + return this.sortOrder; + } +} diff --git a/unicodetools/src/main/java/org/unicode/xml/UcdXML.java b/unicodetools/src/main/java/org/unicode/xml/UcdXML.java new file mode 100644 index 0000000000..c71ac10826 --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/UcdXML.java @@ -0,0 +1,825 @@ +package org.unicode.xml; + +import com.ibm.icu.dev.tool.UOption; +import com.ibm.icu.util.VersionInfo; +import java.io.*; +import java.nio.charset.StandardCharsets; +import java.util.*; +import java.util.regex.Matcher; +import 
java.util.regex.Pattern; +import javax.xml.transform.TransformerConfigurationException; +import org.unicode.props.IndexUnicodeProperties; +import org.unicode.props.UcdProperty; +import org.unicode.props.UcdPropertyValues; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +public class UcdXML { + + private static final String NAMESPACE = "http://www.unicode.org/ns/2003/ucd/1.0"; + + private enum UCDXMLOUTPUTRANGE { + ALL, + NOUNIHAN, + UNIHAN; + } + + private enum UCDXMLOUTPUTTYPE { + FLAT, + GROUPED; + } + + private enum Range { + RESERVED("reserved"), + SURROGATE("surrogate"), + NONCHARACTER("noncharacter"), + CHARACTER("char"), + CJKUNIFIEDIDEOGRAPH("char"), + NONRANGE("nonrange"); + + private final String tag; + + Range(String tag) { + this.tag = tag; + } + + public String toString() { + return tag; + } + } + + private static final UOption[] options = { + UOption.HELP_H(), + UOption.create("ucdversion", 'v', UOption.REQUIRES_ARG), + UOption.create("range", 'r', UOption.REQUIRES_ARG), + UOption.create("output", 'o', UOption.REQUIRES_ARG), + UOption.create("outputfolder", 'f', UOption.REQUIRES_ARG) + }; + private static final int HELP = 0, UCDVERSION = 1, RANGE = 2, OUTPUT = 3, OUTPUTFOLDER = 4; + + public static void main(String[] args) throws Exception { + + VersionInfo ucdVersion = null; + UCDXMLOUTPUTRANGE[] ucdxmloutputranges = + new UCDXMLOUTPUTRANGE[] { + UCDXMLOUTPUTRANGE.ALL, UCDXMLOUTPUTRANGE.NOUNIHAN, UCDXMLOUTPUTRANGE.UNIHAN + }; + UCDXMLOUTPUTTYPE[] ucdxmloutputtypes = + new UCDXMLOUTPUTTYPE[] {UCDXMLOUTPUTTYPE.FLAT, UCDXMLOUTPUTTYPE.GROUPED}; + File destinationFolder = null; + + UOption.parseArgs(args, options); + + if (options[HELP].doesOccur) { + System.out.println( + "UcdXML --ucdversion {version number} --outputfolder {destination} " + + "--range [ALL|NOUNIHAN|UNIHAN] --output [FLAT|GROUPED]"); + System.exit(0); + } + + try { + if (options[UCDVERSION].doesOccur) { + try { + ucdVersion = 
VersionInfo.getInstance(options[UCDVERSION].value); + } catch (Exception e) { + throw new IllegalArgumentException( + "Could not convert " + + options[UCDVERSION].value + + " to a valid UCD version"); + } + } else { + throw new IllegalArgumentException( + "Missing command line option: --ucdversion (or -v)"); + } + if (options[RANGE].doesOccur) { + try { + ucdxmloutputranges = + new UCDXMLOUTPUTRANGE[] { + UCDXMLOUTPUTRANGE.valueOf( + options[RANGE].value.toUpperCase(Locale.ROOT)) + }; + } catch (Exception e) { + throw new IllegalArgumentException( + "Could not convert " + + options[RANGE].value + + " to one of [ALL|NOUNIHAN|UNIHAN]"); + } + } + if (options[OUTPUT].doesOccur) { + try { + ucdxmloutputtypes = + new UCDXMLOUTPUTTYPE[] { + UCDXMLOUTPUTTYPE.valueOf( + options[OUTPUT].value.toUpperCase(Locale.ROOT)) + }; + } catch (Exception e) { + throw new IllegalArgumentException( + "Could not convert " + + options[OUTPUT].value + + " to one of [FLAT|GROUPED]"); + } + } + if (options[OUTPUTFOLDER].doesOccur) { + try { + destinationFolder = + new File( + options[OUTPUTFOLDER].value + + getVersionString(ucdVersion, 3) + + "\\xmltest\\"); + if (!destinationFolder.exists()) { + if (!destinationFolder.mkdir()) { + throw new IOException(); + } + } + } catch (Exception e) { + throw new IllegalArgumentException( + "Could not find or create " + options[OUTPUTFOLDER].value); + } + } else { + throw new IllegalArgumentException( + "Missing command line option: --outputfolder (or -f)"); + } + + } catch (Exception e) { + System.err.println(e.getMessage()); + System.exit(1); + } + + if (ucdVersion != null && destinationFolder.exists()) { + for (UCDXMLOUTPUTRANGE ucdxmloutputrange : ucdxmloutputranges) { + for (UCDXMLOUTPUTTYPE ucdxmloutputtype : ucdxmloutputtypes) { + System.out.println( + "Building the " + + ucdxmloutputrange + + " " + + ucdxmloutputtype + + " UcdXML file for " + + ucdVersion); + buildUcdXMLFile( + ucdVersion, destinationFolder, ucdxmloutputrange, ucdxmloutputtype); 
+ } + } + System.out.println("End"); + System.exit(0); + } else { + System.err.println("Unexpected error when building UcdXML file."); + System.exit(1); + } + } + + private static void buildUcdXMLFile( + VersionInfo ucdVersion, + File destinationFolder, + UCDXMLOUTPUTRANGE outputRange, + UCDXMLOUTPUTTYPE outputType) + throws IOException, TransformerConfigurationException, SAXException { + int lowCodepoint = 0x0; + int highCodepoint = 0x10FFFF; + // Tangut + // int lowCodepoint = 0x17000; + // int highCodepoint = 0x1B2FB; + // 0x10FFFF + + File tempFile = new File(destinationFolder, "temp.xml"); + String outputFilename = + "ucd." + + outputRange.toString().toLowerCase(Locale.ROOT) + + "." + + outputType.toString().toLowerCase(Locale.ROOT) + + ".xml"; + File destinationFile = new File(destinationFolder, outputFilename); + + FileOutputStream fileOutputStream = new FileOutputStream(tempFile); + UCDXMLWriter writer = new UCDXMLWriter(fileOutputStream); + + IndexUnicodeProperties iup = IndexUnicodeProperties.make(ucdVersion); + AttributeResolver attributeResolver = new AttributeResolver(iup); + UCDDataResolver ucdDataResolver = new UCDDataResolver(iup, NAMESPACE, writer); + + writer.startFile(); + writer.startElement("ucd"); + { + writer.startElement("description"); + { + writer.addContent("Unicode " + getVersionString(ucdVersion, 3)); + writer.endElement("description"); + } + buildRepertoire( + writer, + attributeResolver, + ucdVersion, + lowCodepoint, + highCodepoint, + outputRange, + outputType); + if (outputRange != UCDXMLOUTPUTRANGE.UNIHAN) { + ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.BLOCKS); + ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.NAMEDSEQUENCES); + ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.PROVISIONALNAMEDSEQUENCES); + ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.NORMALIZATIONCORRECTIONS); + ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.STANDARDIZEDVARIANTS); + if 
(ucdVersion.compareTo(VersionInfo.getInstance(5, 2, 0)) >= 0) { + ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.CJKRADICALS); + } + if (ucdVersion.compareTo(VersionInfo.getInstance(6, 0, 0)) >= 0) { + ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.EMOJISOURCES); + } + if (ucdVersion.compareTo(VersionInfo.getInstance(16, 0, 0)) >= 0) { + ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.DONOTEMIT); + } + } + writer.endElement("ucd"); + } + writer.endFile(); + fileOutputStream.close(); + cleanUcdXMLFile(tempFile, destinationFile); + if (!tempFile.delete()) { + throw new IOException("Could not delete temporary file " + tempFile); + } + } + + private static void cleanUcdXMLFile(File tempFile, File destinationFile) throws IOException { + // XALAN writes out characters outside the BMP as entities. + // Use this code to replace the entities with the correct characters. + // See: https://issues.apache.org/jira/browse/XALANJ-2595 + + FileInputStream fileInputStream = new FileInputStream(tempFile); + FileOutputStream fileOutputStream = new FileOutputStream(destinationFile); + + InputStreamReader inputStreamReader = + new InputStreamReader(fileInputStream, StandardCharsets.UTF_8); + OutputStreamWriter outputStreamWriter = + new OutputStreamWriter(fileOutputStream, StandardCharsets.UTF_8); + + BufferedReader bufferedReader = new BufferedReader(inputStreamReader); + BufferedWriter bufferedWriter = new BufferedWriter(outputStreamWriter); + + String line; + while ((line = bufferedReader.readLine()) != null) { + Matcher matcher = Pattern.compile("&#(\\d+);").matcher(line); + line = + matcher.replaceAll( + matchResult -> + new String( + Character.toChars(Integer.parseInt(matcher.group(1))))); + bufferedWriter.append(line); + bufferedWriter.newLine(); + } + bufferedWriter.flush(); + fileInputStream.close(); + fileOutputStream.close(); + } + + private static void buildRepertoire( + UCDXMLWriter writer, + AttributeResolver attributeResolver, + VersionInfo 
ucdVersion, + int lowCodepoint, + int highCodepoint, + UCDXMLOUTPUTRANGE outputRange, + UCDXMLOUTPUTTYPE outputType) + throws SAXException { + + writer.startElement("repertoire"); + { + for (int codepoint = lowCodepoint; codepoint <= highCodepoint; codepoint++) { + if (isWritableCodepoint(codepoint, outputRange, attributeResolver)) { + if (outputType == UCDXMLOUTPUTTYPE.GROUPED) { + codepoint = + buildGroup( + writer, + attributeResolver, + ucdVersion, + codepoint, + highCodepoint, + outputRange, + outputType); + } else { + codepoint = + buildChars( + writer, + attributeResolver, + ucdVersion, + codepoint, + highCodepoint, + outputRange, + outputType, + null); + } + } + } + writer.endElement("repertoire"); + } + } + + private static int buildGroup( + UCDXMLWriter writer, + AttributeResolver attributeResolver, + VersionInfo ucdVersion, + int lowCodepoint, + int highCodepoint, + UCDXMLOUTPUTRANGE outputRange, + UCDXMLOUTPUTTYPE outputType) + throws SAXException { + + int lastCodepointInGroup = + getLastCodepointInGroup(attributeResolver, lowCodepoint, highCodepoint); + + AttributesImpl groupAttrs = + getGroupAttributes( + ucdVersion, + attributeResolver, + lowCodepoint, + lastCodepointInGroup, + outputRange); + + writer.startElement("group", groupAttrs); + { + buildChars( + writer, + attributeResolver, + ucdVersion, + lowCodepoint, + lastCodepointInGroup, + outputRange, + outputType, + groupAttrs); + writer.endElement("group"); + } + return lastCodepointInGroup; + } + + private static int buildChars( + UCDXMLWriter writer, + AttributeResolver attributeResolver, + VersionInfo ucdVersion, + int lowCodepoint, + int highCodepoint, + UCDXMLOUTPUTRANGE outputRange, + UCDXMLOUTPUTTYPE outputType, + AttributesImpl groupAttrs) + throws SAXException { + + ArrayList range = new ArrayList<>(); + Range rangeType = Range.NONRANGE; + for (int codepoint = lowCodepoint; codepoint <= highCodepoint; codepoint++) { + if (attributeResolver.isUnassignedCodepoint(codepoint) + || 
(outputRange == UCDXMLOUTPUTRANGE.NOUNIHAN + && attributeResolver.isUnifiedIdeograph(codepoint))) { + Range currentRangeType = getRangeType(attributeResolver, codepoint); + if (!range.isEmpty()) { + if (!currentRangeType.equals(rangeType) + || attributeResolver.isDifferentRange( + ucdVersion, codepoint, codepoint - 1)) { + if (outputRange != UCDXMLOUTPUTRANGE.UNIHAN) { + if (outputType == UCDXMLOUTPUTTYPE.GROUPED) { + buildGroupedRange( + writer, + attributeResolver, + ucdVersion, + range, + rangeType, + groupAttrs); + } else { + buildUngroupedRange( + writer, attributeResolver, ucdVersion, range, rangeType); + } + } + range.clear(); + } + } + range.add(codepoint); + rangeType = currentRangeType; + } else { + if (!range.isEmpty()) { + if (outputRange != UCDXMLOUTPUTRANGE.UNIHAN) { + if (outputType == UCDXMLOUTPUTTYPE.GROUPED) { + buildGroupedRange( + writer, + attributeResolver, + ucdVersion, + range, + rangeType, + groupAttrs); + } else { + buildUngroupedRange( + writer, attributeResolver, ucdVersion, range, rangeType); + } + } + range.clear(); + rangeType = Range.NONRANGE; + } + if (isWritableCodepoint(codepoint, outputRange, attributeResolver)) { + if (outputType == UCDXMLOUTPUTTYPE.GROUPED) { + buildGroupedChar( + writer, + attributeResolver, + ucdVersion, + codepoint, + outputRange, + groupAttrs); + } else { + buildUngroupedChar( + writer, attributeResolver, ucdVersion, codepoint, outputRange); + } + } + } + } + // Handle any range before the end of the repertoire element. 
+ if (!range.isEmpty()) { + if (outputRange != UCDXMLOUTPUTRANGE.UNIHAN) { + if (outputType == UCDXMLOUTPUTTYPE.GROUPED) { + buildGroupedRange( + writer, attributeResolver, ucdVersion, range, rangeType, groupAttrs); + } else { + buildUngroupedRange(writer, attributeResolver, ucdVersion, range, rangeType); + } + } + } + return highCodepoint; + } + + private static void buildUngroupedChar( + UCDXMLWriter writer, + AttributeResolver attributeResolver, + VersionInfo ucdVersion, + int codepoint, + UCDXMLOUTPUTRANGE outputRange) + throws SAXException { + + AttributesImpl charAttributes = + getAttributes(ucdVersion, attributeResolver, codepoint, outputRange); + buildChar(writer, attributeResolver, codepoint, charAttributes); + } + + private static void buildGroupedChar( + UCDXMLWriter writer, + AttributeResolver attributeResolver, + VersionInfo ucdVersion, + int codepoint, + UCDXMLOUTPUTRANGE outputRange, + AttributesImpl groupAttrs) + throws SAXException { + + AttributesImpl orgCharAttributes = + getAttributes(ucdVersion, attributeResolver, codepoint, outputRange); + AttributesImpl charAttributes = new AttributesImpl(); + charAttributes.addAttribute( + NAMESPACE, "cp", "cp", "CDATA", attributeResolver.getHexString(codepoint)); + + for (UcdPropertyDetail propDetail : UcdPropertyDetail.ucdxmlValues()) { + String qName = propDetail.getUcdProperty().getShortName(); + if (qName.startsWith("cjk")) { + qName = qName.substring(2); + } + String orgCharAttributesValue = orgCharAttributes.getValue(qName); + String groupAttributeValue = groupAttrs.getValue(qName); + if (!Objects.equals(orgCharAttributesValue, groupAttributeValue)) { + charAttributes.addAttribute( + NAMESPACE, + qName, + qName, + "CDATA", + Objects.requireNonNullElse(orgCharAttributesValue, "")); + } + } + buildChar(writer, attributeResolver, codepoint, charAttributes); + } + + private static void buildChar( + UCDXMLWriter writer, + AttributeResolver attributeResolver, + int codepoint, + AttributesImpl 
charAttributes) + throws SAXException { + writer.startElement("char", charAttributes); + { + HashMap nameAliases = attributeResolver.getNameAliases(codepoint); + if (null != nameAliases && !nameAliases.isEmpty()) { + for (String alias : nameAliases.keySet()) { + AttributesImpl nameAliasAt = new AttributesImpl(); + nameAliasAt.addAttribute(NAMESPACE, "alias", "alias", "CDATA", alias); + String type = nameAliases.get(alias); + if (!Objects.equals(type, "none")) { + nameAliasAt.addAttribute( + NAMESPACE, "type", "type", "CDATA", nameAliases.get(alias)); + } + writer.startElement("name-alias", nameAliasAt); + { + writer.endElement("name-alias"); + } + } + } + writer.endElement("char"); + } + } + + private static void buildGroupedRange( + UCDXMLWriter writer, + AttributeResolver attributeResolver, + VersionInfo ucdVersion, + ArrayList range, + Range rangeType, + AttributesImpl groupAttrs) + throws SAXException { + AttributesImpl orgRangeAttributes = + getReservedAttributes(ucdVersion, attributeResolver, range); + AttributesImpl rangeAttributes = new AttributesImpl(); + if (range.size() == 1) { + rangeAttributes.addAttribute( + NAMESPACE, "cp", "cp", "CDATA", attributeResolver.getHexString(range.get(0))); + } else { + rangeAttributes.addAttribute( + NAMESPACE, + "first-cp", + "first-cp", + "CDATA", + attributeResolver.getHexString(range.get(0))); + rangeAttributes.addAttribute( + NAMESPACE, + "last-cp", + "last-cp", + "CDATA", + attributeResolver.getHexString(range.get(range.size() - 1))); + } + + for (UcdPropertyDetail propDetail : UcdPropertyDetail.ucdxmlValues()) { + String qName = propDetail.getUcdProperty().getShortName(); + if (qName.startsWith("cjk")) { + qName = qName.substring(2); + } + String orgCharAttributesValue = orgRangeAttributes.getValue(qName); + String groupAttributeValue = groupAttrs.getValue(qName); + if (!Objects.equals(orgCharAttributesValue, groupAttributeValue)) { + rangeAttributes.addAttribute( + NAMESPACE, + qName, + qName, + "CDATA", + 
Objects.requireNonNullElse(orgCharAttributesValue, "")); + } + } + writer.startElement(rangeType.tag, rangeAttributes); + { + writer.endElement(rangeType.tag); + } + } + + private static void buildUngroupedRange( + UCDXMLWriter writer, + AttributeResolver attributeResolver, + VersionInfo ucdVersion, + ArrayList range, + Range rangeType) + throws SAXException { + AttributesImpl rangeAttributes = + getReservedAttributes(ucdVersion, attributeResolver, range); + writer.startElement(rangeType.tag, rangeAttributes); + { + writer.endElement(rangeType.tag); + } + } + + private static boolean isWritableCodepoint( + int codepoint, UCDXMLOUTPUTRANGE outputRange, AttributeResolver attributeResolver) { + return outputRange == UCDXMLOUTPUTRANGE.ALL + || (outputRange == UCDXMLOUTPUTRANGE.UNIHAN + && attributeResolver.isUnihanAttributeRange(codepoint)) + || (outputRange == UCDXMLOUTPUTRANGE.NOUNIHAN + && !attributeResolver.isUnifiedIdeograph(codepoint)); + } + + private static Range getRangeType(AttributeResolver attributeResolver, int codepoint) { + String NChar = attributeResolver.getNChar(codepoint); + UcdPropertyValues.General_Category_Values gc = attributeResolver.getgc(codepoint); + + if (attributeResolver.isUnihanAttributeRange(codepoint)) { + return Range.CJKUNIFIEDIDEOGRAPH; + } + if (gc.equals(UcdPropertyValues.General_Category_Values.Surrogate)) { + return Range.SURROGATE; + } + if (gc.equals(UcdPropertyValues.General_Category_Values.Private_Use)) { + return Range.CHARACTER; + } + if (NChar.equals(UcdPropertyValues.Binary.Yes.getShortName())) { + return Range.NONCHARACTER; + } + return Range.RESERVED; + } + + private static int getLastCodepointInGroup( + AttributeResolver attributeResolver, int lowCodepoint, int highCodepoint) { + String blk = attributeResolver.getAttributeValue(UcdProperty.Block, lowCodepoint); + for (int codepoint = lowCodepoint; codepoint <= highCodepoint; codepoint++) { + if (!blk.equals(attributeResolver.getAttributeValue(UcdProperty.Block, 
codepoint))) {
+                return codepoint - 1;
+            }
+            if (codepoint == 0x20 - 1 // put the C0 controls in their own group
+                    || codepoint == 0xa0 - 1 // put the C1 controls in their own group
+                    || codepoint == 0x1160 - 1 // split the jamos into three groups
+                    || codepoint == 0x11a8 - 1 // split the jamos into three groups
+                    || codepoint == 0x1f1e6 - 1 // put the regional indicators in their own group
+            ) {
+                return codepoint;
+            }
+        }
+        return highCodepoint;
+    }
+
+    /**
+     * Builds the full XML attribute list for a single code point.
+     *
+     * <p>The list starts with {@code cp} (hex string from the resolver), followed by one attribute
+     * per entry of {@code UcdPropertyDetail.ucdxmlValues()} whose version window contains {@code
+     * version} and for which {@code getIsAttributeIncluded} returns true. Attribute names are the
+     * property short names; a short name starting with "cjk" has its first two characters removed.
+     *
+     * @param version UCD version being generated
+     * @param attributeResolver source of property values
+     * @param codepoint code point to describe
+     * @param outputRange repertoire range being written
+     * @return the attributes for the code point's element
+     */
+    private static AttributesImpl getAttributes(
+            VersionInfo version,
+            AttributeResolver attributeResolver,
+            int codepoint,
+            UCDXMLOUTPUTRANGE outputRange) {
+        AttributesImpl attributes = new AttributesImpl();
+        attributes.addAttribute(
+                NAMESPACE, "cp", "cp", "CDATA", attributeResolver.getHexString(codepoint));
+
+        for (UcdPropertyDetail propDetail : UcdPropertyDetail.ucdxmlValues()) {
+            UcdProperty prop = propDetail.getUcdProperty();
+            // NOTE(review): the max version is treated as exclusive here (< 0), whereas
+            // getReservedAttributes treats it as inclusive (<= 0) - confirm which is intended.
+            if (version.compareTo(propDetail.getMinVersion()) >= 0
+                    && (propDetail.getMaxVersion() == null
+                            || version.compareTo(propDetail.getMaxVersion()) < 0)) {
+                String attrValue = attributeResolver.getAttributeValue(prop, codepoint);
+                boolean isAttributeIncluded =
+                        getIsAttributeIncluded(
+                                attrValue,
+                                attributeResolver.isUnihanAttributeRange(codepoint),
+                                propDetail,
+                                prop,
+                                outputRange);
+                if (isAttributeIncluded) {
+                    String propName = prop.getShortName();
+                    // "cjkXyz" is written as "kXyz".
+                    if (propName.startsWith("cjk")) {
+                        propName = propName.substring(2);
+                    }
+                    attributes.addAttribute(NAMESPACE, propName, propName, "CDATA", attrValue);
+                }
+            }
+        }
+        return attributes;
+    }
+
+    private static AttributesImpl getGroupAttributes(
+            VersionInfo version,
+            AttributeResolver attributeResolver,
+            int lowCodepoint,
+            int highCodepoint,
+            UCDXMLOUTPUTRANGE outputRange) {
+        AttributesImpl attributes = new AttributesImpl();
+
+        for (UcdPropertyDetail propDetail : UcdPropertyDetail.ucdxmlValues()) {
+            UcdProperty prop = propDetail.getUcdProperty();
+            if (version.compareTo(propDetail.getMinVersion()) >= 0
+                    && 
(propDetail.getMaxVersion() == null + || version.compareTo(propDetail.getMaxVersion()) < 0)) { + int totalCount = 0; + Map counters = new LinkedHashMap<>(); + + for (int codepoint = lowCodepoint; codepoint <= highCodepoint; codepoint++) { + if (!attributeResolver.isUnassignedCodepoint(codepoint)) { + String attrValue = attributeResolver.getAttributeValue(prop, codepoint); + int currentCount = + (counters.get(attrValue) == null) ? 0 : counters.get(attrValue); + currentCount++; + totalCount++; + counters.put(attrValue, currentCount); + } + } + int max = Integer.MIN_VALUE; + String bestAttrValue = null; + for (String attrValue : counters.keySet()) { + int thisCount = counters.get(attrValue); + if (thisCount > max) { + max = thisCount; + bestAttrValue = attrValue; + } + } + switch (prop) { + case Decomposition_Mapping: + case Simple_Uppercase_Mapping: + case Simple_Lowercase_Mapping: + case Simple_Titlecase_Mapping: + case Uppercase_Mapping: + case Lowercase_Mapping: + case Titlecase_Mapping: + case Simple_Case_Folding: + case Case_Folding: + if (bestAttrValue != null) { + bestAttrValue = "#"; + } + } + if (max > 0.2 * totalCount && max > 1) { + boolean isAttributeIncluded = + getIsAttributeIncluded( + bestAttrValue, + attributeResolver.isUnihanAttributeRange(lowCodepoint), + propDetail, + prop, + outputRange); + if (isAttributeIncluded) { + String propName = prop.getShortName(); + if (propName.startsWith("cjk")) { + propName = propName.substring(2); + } + attributes.addAttribute( + NAMESPACE, propName, propName, "CDATA", bestAttrValue); + } + } + } + } + return attributes; + } + + private static boolean getIsAttributeIncluded( + String attrValue, + boolean isUnihanAttributeRange, + UcdPropertyDetail propDetail, + UcdProperty prop, + UCDXMLOUTPUTRANGE outputRange) { + if (attrValue == null) { + return false; + } + if (isUnihanAttributeRange) { + if (outputRange == UCDXMLOUTPUTRANGE.UNIHAN) { + if (prop.equals(UcdProperty.Numeric_Type) && !attrValue.equals("None")) { + 
return true; + } + if (prop.equals(UcdProperty.Numeric_Value) && !attrValue.equals("NaN")) { + return true; + } + return propDetail.isCJKAttribute() + && (propDetail.isCJKShowIfEmpty() || !attrValue.isEmpty()); + } + if (outputRange == UCDXMLOUTPUTRANGE.NOUNIHAN && propDetail.isCJKAttribute()) { + return false; + } + if (propDetail.isCJKShowIfEmpty()) { + return true; + } + } + if (propDetail.isBaseAttribute()) { + return true; + } + return !attrValue.isEmpty(); + } + + private static AttributesImpl getReservedAttributes( + VersionInfo version, AttributeResolver attributeResolver, ArrayList range) { + AttributesImpl attributes = new AttributesImpl(); + + if (range.size() == 1) { + attributes.addAttribute( + NAMESPACE, "cp", "cp", "CDATA", attributeResolver.getHexString(range.get(0))); + } else { + attributes.addAttribute( + NAMESPACE, + "first-cp", + "first-cp", + "CDATA", + attributeResolver.getHexString(range.get(0))); + attributes.addAttribute( + NAMESPACE, + "last-cp", + "last-cp", + "CDATA", + attributeResolver.getHexString(range.get(range.size() - 1))); + } + for (UcdPropertyDetail propDetail : UcdPropertyDetail.baseValues()) { + UcdProperty prop = propDetail.getUcdProperty(); + if (version.compareTo(propDetail.getMinVersion()) >= 0 + && (propDetail.getMaxVersion() == null + || version.compareTo(propDetail.getMaxVersion()) <= 0)) { + String attrValue = + attributeResolver.getAttributeValue( + propDetail.getUcdProperty(), range.get(0)); + + attributes.addAttribute( + NAMESPACE, prop.getShortName(), prop.getShortName(), "CDATA", attrValue); + } + } + return attributes; + } + + private static String getVersionString(VersionInfo version, int maxDigits) { + if (maxDigits >= 1 && maxDigits <= 4) { + int[] digits = + new int[] { + version.getMajor(), + version.getMinor(), + version.getMilli(), + version.getMicro() + }; + StringBuilder verStr = new StringBuilder(7); + verStr.append(digits[0]); + for (int i = 1; i < maxDigits; ++i) { + verStr.append("."); + 
verStr.append(digits[i]); + } + return verStr.toString(); + } else { + throw new IllegalArgumentException("Invalid maxDigits range"); + } + } +} diff --git a/unicodetools/src/main/java/org/unicode/xml/XMLProperties.java b/unicodetools/src/main/java/org/unicode/xml/XMLProperties.java new file mode 100644 index 0000000000..5cd2df3af4 --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/XMLProperties.java @@ -0,0 +1,482 @@ +package org.unicode.xml; + +import com.ibm.icu.impl.UnicodeMap; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.*; +import java.util.Map.Entry; +import org.unicode.cldr.util.XMLFileReader; +import org.unicode.props.IndexUnicodeProperties; +import org.unicode.props.UcdProperty; +import org.unicode.text.utility.Utility; +import org.xml.sax.*; + +public class XMLProperties { + + enum XmlLeaf { + // Leaf + BLOCK, + BLOCKS, + CHAR, + CJK_RADICAL, + CJK_RADICALS, + DESCRIPTION, + DO_NOT_EMIT, + EMOJI_SOURCE, + EMOJI_SOURCES, + GROUP, + INSTEAD, + NAME_ALIAS, + NAMED_SEQUENCE, + NAMED_SEQUENCES, + NONCHARACTER, + NORMALIZATION_CORRECTION, + NORMALIZATION_CORRECTIONS, + PROVISIONAL_NAMED_SEQUENCES, + REPERTOIRE, + RESERVED, + STANDARDIZED_VARIANT, + STANDARDIZED_VARIANTS, + SURROGATE, + UCD; + static final XmlLeaf GREATEST_LEAF = NAME_ALIAS; + static final XmlLeaf GREATEST_BOTH = CHAR; + + static XmlLeaf forString(String source) { + try { + return XmlLeaf.valueOf(source.toUpperCase().replace('-', '_')); + } catch (final Exception e) { + return null; + } + } + } + + static class IntRange { + int start; + int end; + } + + Map> property2data = + new EnumMap>(UcdProperty.class); + + { + for (final UcdProperty prop : UcdProperty.values()) { + property2data.put(prop, new UnicodeMap()); + } + } + + Set leavesNotHandled = new LinkedHashSet(); + + public XMLProperties(File ucdxmlFile) { + readFile(ucdxmlFile); + + for (final UcdProperty prop : property2data.keySet()) { + final UnicodeMap map = 
property2data.get(prop); + map.freeze(); + } + } + + public void readFile(File ucdxmlFile) { + try { + System.out.println("Reading: " + ucdxmlFile.toString()); + final FileInputStream fis = new FileInputStream(ucdxmlFile); + final XMLReader xmlReader = XMLFileReader.createXMLReader(false); + xmlReader.setErrorHandler(new MyErrorHandler()); + xmlReader.setContentHandler(new MyContentHandler()); + final InputSource is = new InputSource(fis); + is.setSystemId(ucdxmlFile.toString()); + xmlReader.parse(is); + fis.close(); + } catch (final IOException | SAXException e) { + System.out.println("\t" + "Can't read " + ucdxmlFile); + System.out.println("\t" + e.getClass() + "\t" + e.getMessage()); + } + } + + class MyContentHandler implements ContentHandler { + IntRange cp = new IntRange(); + HashMap attributes = new HashMap(); + HashMap groupAttributes = new HashMap(); + private final List lastElements = new ArrayList(); + + public MyContentHandler() {} + + @Override + public void characters(char[] arg0, int arg1, int arg2) throws SAXException { + final String chars = String.valueOf(arg0, arg1, arg2).trim(); + if (!chars.trim().isEmpty() + && lastElements.get(lastElements.size() - 1) != XmlLeaf.DESCRIPTION) { + throw new IllegalArgumentException("Should have no element content"); + } + } + + @Override + public void endElement(String arg0, String arg1, String arg2) throws SAXException { + try { + if (lastElements.isEmpty()) { + System.out.println( + "endElement: can't remove last element. Args: " + + arg0 + + ", " + + arg1 + + ", " + + arg2); + } else { + final XmlLeaf removed = lastElements.remove(lastElements.size() - 1); + } + } catch (ArrayIndexOutOfBoundsException e) { + throw new IllegalArgumentException( + "endElement: can't remove last element. 
Args: " + + arg0 + + ", " + + arg1 + + ", " + + arg2, + e); + } + } + + @Override + public void endDocument() throws SAXException {} + + @Override + public void endPrefixMapping(String arg0) throws SAXException {} + + @Override + public void ignorableWhitespace(char[] arg0, int arg1, int arg2) throws SAXException {} + + @Override + public void processingInstruction(String arg0, String arg1) throws SAXException {} + + @Override + public void setDocumentLocator(Locator arg0) {} + + @Override + public void skippedEntity(String arg0) throws SAXException {} + + @Override + public void startDocument() throws SAXException {} + + @Override + public void startPrefixMapping(String arg0, String arg1) throws SAXException {} + + @Override + public void startElement( + String namespaceURI, String localName, String qName, Attributes atts) { + try { + final XmlLeaf xmlLeaf = XmlLeaf.forString(qName); + if (xmlLeaf == null) { + throw new IllegalArgumentException(qName); + } + lastElements.add(xmlLeaf); + // System.out.println("Added:\t" + lastElements); + + if (xmlLeaf == XmlLeaf.GROUP) { + groupAttributes.clear(); + addAttributes(atts, groupAttributes); + return; + } + attributes.clear(); + attributes.putAll(groupAttributes); + addAttributes(atts, attributes); + String cps; + switch (xmlLeaf) { + case CHAR: + case RESERVED: + case SURROGATE: + case NONCHARACTER: + parseCp(attributes); + for (final Entry entry : attributes.entrySet()) { + doAttributes(entry.getKey(), entry.getValue()); + } + if (xmlLeaf == XmlLeaf.NONCHARACTER) { + property2data + .get(UcdProperty.Noncharacter_Code_Point) + .putAll(cp.start, cp.end, "Yes"); + } + break; + case BLOCK: + parseCp(attributes); + property2data + .get(UcdProperty.Block) + .putAll(cp.start, cp.end, attributes.get("name")); + break; + case NAMED_SEQUENCE: + cps = Utility.fromHex(attributes.get("cps")); + property2data + .get(UcdProperty.Named_Sequences) + .put(cps, attributes.get("name")); + break; + case CJK_RADICAL: + final String number 
= attributes.get("number"); + setProp( + Utility.fromHex(attributes.get("radical")), + UcdProperty.CJK_Radical, + number); + setProp( + Utility.fromHex(attributes.get("ideograph")), + UcdProperty.CJK_Radical, + number); + break; + case EMOJI_SOURCE: + cps = Utility.fromHex(attributes.get("unicode")); + setProp(cps, UcdProperty.Emoji_DCM, attributes.get("docomo")); + setProp(cps, UcdProperty.Emoji_KDDI, attributes.get("kddi")); + setProp(cps, UcdProperty.Emoji_SB, attributes.get("softbank")); + break; + case REPERTOIRE: + case BLOCKS: + case CJK_RADICALS: + case EMOJI_SOURCES: + case NAMED_SEQUENCES: + case PROVISIONAL_NAMED_SEQUENCES: + case NORMALIZATION_CORRECTIONS: + case STANDARDIZED_VARIANTS: + case DESCRIPTION: + case DO_NOT_EMIT: + // non-informational nodes, skip + if (atts.getLength() != 0) { + throw new IllegalArgumentException("Has attributes"); + } + break; + case UCD: + if (atts.getLength() != 0) { + throw new IllegalArgumentException( + "Has wrong number of attributes: " + attributes.entrySet()); + } + break; + case NAME_ALIAS: + final String alias = + attributes.get("alias") + "(" + attributes.get("type") + ")"; + appendProp(cp.start, UcdProperty.Name_Alias, alias); + break; + case STANDARDIZED_VARIANT: + { + String desc = attributes.get("desc"); + final String when = attributes.get("when"); + if (!when.isEmpty()) { + desc = desc + "(" + when + ")"; + } + cps = Utility.fromHex(attributes.get("cps")); + appendProp(cps, UcdProperty.Standardized_Variant, desc); + break; + } + case NORMALIZATION_CORRECTION: + final String correction = + "old: " + + attributes.get("old") + + " new: " + + attributes.get("new") + + " version: " + + attributes.get("version"); + cps = Utility.fromHex(attributes.get("cp")); + appendProp(cps, UcdProperty.NC_Original, correction); + break; + case INSTEAD: + final String instead = + "use: " + + attributes.get("use") + + " because: " + + attributes.get("because"); + cps = attributes.get("of"); + appendProp(cps, 
UcdProperty.Do_Not_Emit_Preferred, instead); + break; + case GROUP: + break; // handled above. Leaving case for clarity + default: + leavesNotHandled.add(qName); + break; + } + } catch (final Exception e) { + System.out.println( + "Exception: " + + qName + + "\t" + + e.getClass().getName() + + "\t" + + e.getMessage()); + } + } + + public void addAttributes(Attributes atts, Map map) { + for (int i = 0; i < atts.getLength(); ++i) { + map.put(atts.getQName(i), atts.getValue(i)); + } + } + + public void setProp(String cps, UcdProperty ucdProperty, String docomo) { + if (docomo != null) { + property2data.get(ucdProperty).put(cps, docomo); + } + } + + public void setProp(int cps, UcdProperty ucdProperty, String docomo) { + if (docomo != null) { + property2data.get(ucdProperty).put(cps, docomo); + } + } + + public void appendProp(int cps, UcdProperty ucdProperty, String docomo) { + final UnicodeMap unicodeMap = property2data.get(ucdProperty); + final String former = unicodeMap.get(cps); + unicodeMap.put(cps, former == null ? docomo : former + "; " + docomo); + } + + public void appendProp(String cps, UcdProperty ucdProperty, String docomo) { + final UnicodeMap unicodeMap = property2data.get(ucdProperty); + final String former = unicodeMap.get(cps); + unicodeMap.put(cps, former == null ? 
docomo : former + "; " + docomo); + } + + public void parseCp(HashMap attributes2) { + final String cpString = attributes2.get("cp"); + if (cpString != null) { + cp.start = cp.end = Integer.parseInt(cpString, 16); + } else { + cp.start = Integer.parseInt(attributes2.get("first-cp"), 16); + cp.end = Integer.parseInt(attributes2.get("last-cp"), 16); + } + } + + public UnicodeMap doAttributes(String key, String value) { + UcdProperty prop = UcdProperty.forString(key); + // if (prop == UcdProperty.Deprecated && cp.start > 0xE0000 && cp.start < + // 0xE00FF) { + // System.out.println(Utility.hex(cp.start) + "," + Utility.hex(cp.end) + + // "\t" + key + "\t" + value); + // } + if (prop == null) { + if (key.endsWith("cp")) { + if (key.equals("cp") || key.equals("last-cp") || key.equals("first-cp")) { + return null; + } + } else if (key.equals("InSC")) { + prop = UcdProperty.Indic_Syllabic_Category; + } else if (key.equals("InMC")) { + prop = UcdProperty.Indic_Syllabic_Category; + } + if (prop == null) { + return null; + } + } + final UnicodeMap data = property2data.get(prop); + if (data == null) { + System.out.println("can't get data for " + key); + return null; + } + data.putAll(cp.start, cp.end, value.intern()); + return data; + } + } + + static class MyErrorHandler implements ErrorHandler { + @Override + public void error(SAXParseException exception) throws SAXException { + // System.out.println("\nerror: " + XMLFileReader.showSAX(exception)); + throw exception; + } + + @Override + public void fatalError(SAXParseException exception) throws SAXException { + // System.out.println("\nfatalError: " + XMLFileReader.showSAX(exception)); + throw exception; + } + + @Override + public void warning(SAXParseException exception) throws SAXException { + // System.out.println("\nwarning: " + XMLFileReader.showSAX(exception)); + throw exception; + } + } + + public UnicodeMap getMap(UcdProperty prop) { + return property2data.get(prop); + } + + public Set getLeavesNotHandled() { + 
return leavesNotHandled; + } + + static String show(String ival) { + if (ival == null) { + return "null"; + } else if (ival.isEmpty()) { + return ""; + } else if (ival.codePointAt(0) < 0x20) { + return "\\u{" + Utility.hex(ival, 4) + "}"; + } + return "«" + ival + "»"; + } + + // private static final String NO_VALUE = + // IndexUnicodeProperties.DefaultValueType.NO_VALUE.toString(); + // private static final String NAN = IndexUnicodeProperties.DefaultValueType.NaN.toString(); + + static final boolean HACK_XML_DEFAULTS = false; + + public static String getXmlResolved(UcdProperty property, int codePoint, String propertyValue) { + if (property == UcdProperty.Name) { + int debug = 0; + } + switch (property.getType()) { + case Binary: + if (HACK_XML_DEFAULTS) { + if (propertyValue == null) { + propertyValue = "No"; + } else { + propertyValue = + IndexUnicodeProperties.normalizeValue(property, propertyValue); + } + break; + } + // $FALL-THROUGH$ + case Enumerated: + case Catalog: + if (propertyValue != null) { + propertyValue = IndexUnicodeProperties.normalizeValue(property, propertyValue); + } + break; + case Numeric: + // if (HACK_XML_DEFAULTS) { + // if (propertyValue == null || propertyValue.isEmpty()) { + // propertyValue = "NaN"; + // } + // } + switch (property) { + case kOtherNumeric: + case kPrimaryNumeric: + case kAccountingNumeric: + if (propertyValue == null || propertyValue.isEmpty()) { + propertyValue = "NaN"; + } + break; + } + break; + case Miscellaneous: + if (propertyValue != null) { + switch (property) { + case Script_Extensions: + propertyValue = + IndexUnicodeProperties.normalizeValue(property, propertyValue); + break; + // case Name: + // break; + default: + propertyValue = propertyValue.replace("#", Utility.hex(codePoint)); + } + } + break; + case String: + if (propertyValue != null) { + propertyValue = propertyValue.replace("#", Utility.hex(codePoint)); + propertyValue = Utility.fromHex(propertyValue); + } + break; + default: + break; + } + return 
propertyValue; + // return propertyValue == null ? "" : propertyValue; + } +} diff --git a/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt b/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt index 7d4ce84e71..0f9cbda3dc 100644 --- a/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt +++ b/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt @@ -67,6 +67,7 @@ CJKR ; CJK_Radical EDCM ; Emoji_DCM EKDDI ; Emoji_KDDI ESB ; Emoji_SB +EVS ; emoji_variation_sequence NS ; Named_Sequences NSP ; Named_Sequences_Prov SV ; Standardized_Variant @@ -160,6 +161,9 @@ cjkJoyoKanji ; kJoyoKanji cjkKoreanEducationHanja ; kKoreanEducationHanja cjkKoreanName ; kKoreanName cjkTGH ; kTGH +ncCorrected ; NC_Corrected +ncOriginal ; NC_Original +ncVersion ; NC_Version # 13.0 cjkSpoofingVariant ; kSpoofingVariant cjkTGHZ2013 ; kTGHZ2013 @@ -187,4 +191,4 @@ kReading ; kReading kEH_Func ; kEH_Func kEH_FVal ; kEH_FVal -kEH_UniK ; kEH_UniK \ No newline at end of file +kEH_UniK ; kEH_UniK diff --git a/unicodetools/src/main/resources/org/unicode/props/IndexPropertyRegex.txt b/unicodetools/src/main/resources/org/unicode/props/IndexPropertyRegex.txt index a9b3e9f12d..1cf4ccd72e 100644 --- a/unicodetools/src/main/resources/org/unicode/props/IndexPropertyRegex.txt +++ b/unicodetools/src/main/resources/org/unicode/props/IndexPropertyRegex.txt @@ -44,9 +44,11 @@ $codePoint0 = ($codePoints)? 
# Main data Bidi_Mirroring_Glyph ; SINGLE_VALUED ; $codePoint +Bidi_Paired_Bracket ; SINGLE_VALUED ; $codePoint Simple_Lowercase_Mapping ; SINGLE_VALUED ; $codePoint Simple_Titlecase_Mapping ; SINGLE_VALUED ; $codePoint Simple_Uppercase_Mapping ; SINGLE_VALUED ; $codePoint +Equivalent_Unified_Ideograph; SINGLE_VALUED ; $codePoint NFKC_Casefold ; SINGLE_VALUED ; $codePoint0 NFKC_Simple_Casefold ; SINGLE_VALUED ; $codePoint0 @@ -142,7 +144,7 @@ kHanYu ; MULTI_VALUED ; [1-8][0-9]{4}\.[0-3 kIRGHanyuDaZidian ; SINGLE_VALUED ; [1-8][0-9]{4}\.[0-3][0-9][01] kCNS1992 ; SINGLE_VALUED ; [1-9]-[0-9A-F]{4} kTotalStrokes ; ORDERED ; [1-9][0-9]{0,2} -kRSUnicode ; ORDERED ; [1-9][0-9]{0,2}\'?\.[0-9]{1,2} +kRSUnicode ; ORDERED ; [1-9][0-9]{0,2}\'?\.[0-9]{1,2} kRSJapanese ; EXTENSIBLE ; [1-9][0-9]{0,2}\.[0-9]{1,2} kRSKanWa ; EXTENSIBLE ; [1-9][0-9]{0,2}\.[0-9]{1,2} kRSKangXi ; EXTENSIBLE ; [1-9][0-9]{0,2}\.[0-9]{1,2} @@ -170,11 +172,11 @@ kHanyuPinlu ; MULTI_VALUED ; [a-z\x{308}]+[1-5]\ kCantonese ; MULTI_VALUED ; [a-z]{1,6}[1-6] kTang ; MULTI_VALUED ; \*?[A-Za-z()\x{E6}\x{251}\x{259}\x{25B}\x{300}\x{30C}]+ -kJinmeiyoKanji ; MULTI_VALUED ; (20[0-9]{2})(:U\+2?[0-9A-F]{4})? -kJoyoKanji ; MULTI_VALUED ; (20[0-9]{2})|(U\+2?[0-9A-F]{4}) +kJinmeiyoKanji ; MULTI_VALUED ; (20[0-9]{2})(:U\+2?[0-9A-F]{4})? 
+kJoyoKanji ; MULTI_VALUED ; (20[0-9]{2})|(U\+2?[0-9A-F]{4}) kKoreanEducationHanja ; MULTI_VALUED ; 20[0-9]{2} -kKoreanName ; MULTI_VALUED ; (20[0-9]{2})(:U\+2?[0-9A-F]{4})* -kTGH ; MULTI_VALUED ; 20[0-9]{2}:[1-9][0-9]{0,3} +kKoreanName ; MULTI_VALUED ; (20[0-9]{2})(:U\+2?[0-9A-F]{4})* +kTGH ; MULTI_VALUED ; 20[0-9]{2}:[1-9][0-9]{0,3} kIRG_UKSource ; SINGLE_VALUED ; V[0-4]-[0-9A-F]{4} kIRG_SSource ; SINGLE_VALUED ; V[0-4]-[0-9A-F]{4} @@ -199,28 +201,37 @@ kZhuangNumeric ; MULTI_VALUED ; .* kFanqie ; MULTI_VALUED ; .* kZhuang ; MULTI_VALUED ; .* +kSrc_NushuDuben ; SINGLE_VALUED ; [0-9]+\.[0-9]+ +kReading ; SINGLE_VALUED ; [a-z]{1,6}[1-6]+ +kRSTUnicode ; SINGLE_VALUED ; [0-9]+\.[0-9]+ +kTGT_MergedSrc ; SINGLE_VALUED ; L2008-[0-9A-F]{4,5}(-[0-9]{4,5})? + +NC_Original ; SINGLE_VALUED ; [0-9A-F]{4,5} +NC_Corrected ; SINGLE_VALUED ; [0-9A-F]{4,5} +NC_Version ; SINGLE_VALUED ; [0-9]\.[0-9]\.[0-9] + # ============================= # Catalog/Enum/Binary Properties # All not listed are SINGLE_VALUED ; null # ============================= -Script_Extensions ; MULTI_VALUED ; -Standardized_Variant ; MULTI_VALUED ; .* +Script_Extensions ; MULTI_VALUED ; +Standardized_Variant ; MULTI_VALUED ; .* -Idn_Status ; SINGLE_VALUED ; -Idn_Mapping ; SINGLE_VALUED ; $codePoints -Idn_2008 ; SINGLE_VALUED ; +Idn_Status ; SINGLE_VALUED ; +Idn_Mapping ; SINGLE_VALUED ; $codePoints +Idn_2008 ; SINGLE_VALUED ; -Identifier_Status ; SINGLE_VALUED ; -Identifier_Type ; MULTI_VALUED ; +Identifier_Status ; SINGLE_VALUED ; +Identifier_Type ; MULTI_VALUED ; -Confusable_SL ; SINGLE_VALUED ; $codePoints -Confusable_SA ; SINGLE_VALUED ; $codePoints -Confusable_ML ; SINGLE_VALUED ; $codePoints -Confusable_MA ; SINGLE_VALUED ; $codePoints +Confusable_SL ; SINGLE_VALUED ; $codePoints +Confusable_SA ; SINGLE_VALUED ; $codePoints +Confusable_ML ; SINGLE_VALUED ; $codePoints +Confusable_MA ; SINGLE_VALUED ; $codePoints -#Emoji ; SINGLE_VALUED ; -#Emoji_Presentation ; SINGLE_VALUED ; -#Emoji_Modifier ; 
SINGLE_VALUED ; -#Emoji_Modifier_Base ; SINGLE_VALUED ; +#Emoji ; SINGLE_VALUED ; +#Emoji_Presentation ; SINGLE_VALUED ; +#Emoji_Modifier ; SINGLE_VALUED ; +#Emoji_Modifier_Base ; SINGLE_VALUED ; diff --git a/unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt b/unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt index f7c9da838c..5ff7cbdf58 100644 --- a/unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt +++ b/unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt @@ -36,6 +36,8 @@ FileType ; Unihan_OtherMappings ; PropertyValue FileType ; Unihan_RadicalStrokeCounts ; PropertyValue FileType ; Unihan_Readings ; PropertyValue FileType ; Unihan_Variants ; PropertyValue +FileType ; NushuSources ; PropertyValue +FileType ; TangutSources ; PropertyValue # NameAliases File Type # Contains a multivalued property, where successive values are not in the same line, but are divided out on successive lines with the same code point @@ -43,6 +45,7 @@ FileType ; Unihan_Variants ; PropertyValue FileType ; NameAliases ; NameAliases FileType ; NameAliasesProv ; NameAliases FileType ; StandardizedVariants ; StandardizedVariants +FileType ; emoji-variation-sequences ; StandardizedVariants # CJKRadicals File Type @@ -320,6 +323,15 @@ Unihan_Variants ; kSpoofingVariant Unihan_Variants ; kTraditionalVariant Unihan_Variants ; kZVariant +NushuSources ; kSrc_NushuDuben +NushuSources ; kReading +TangutSources ; kRSTUnicode +TangutSources ; kTGT_MergedSrc + +NormalizationCorrections ; NC_Original +NormalizationCorrections ; NC_Corrected +NormalizationCorrections ; NC_Version + # Properties removed from Unihan before 5.1. # Point to a nonexistent file so that we don’t try to read them from the most recent monolithic # Unihan, as we would then get confused by the other (still-extant) properties in that file. 
@@ -438,6 +450,7 @@ EmojiSources ; Emoji_SB ; 3 NamedSequences ; Named_Sequences NamedSequencesProv ; Named_Sequences_Prov StandardizedVariants ; Standardized_Variant +emoji-variation-sequences ; emoji-variation-sequence DoNotEmit ; Do_Not_Emit_Preferred ; 1 DoNotEmit ; Do_Not_Emit_Type ; 2 @@ -488,15 +501,6 @@ emoji/*/emoji-zwj-sequences; RGI_Emoji_Zwj_Sequence #emoji/*/emoji-test ; Emoji_Short_Name - -FileType ; TangutSources ; PropertyValue -TangutSources ; kTGT_MergedSrc -TangutSources ; kRSTUnicode - -FileType ; NushuSources ; PropertyValue -NushuSources ; kSrc_NushuDuben -NushuSources ; kReading - FileType ; Unikemet ; PropertyValue Unikemet ; kEH_Cat Unikemet ; kEH_Core diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/Bidi_C.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Bidi_C.xml new file mode 100644 index 0000000000..617113bf28 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Bidi_C.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute Bidi_C { boolean }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/Bidi_M.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Bidi_M.xml new file mode 100644 index 0000000000..c1380221b2 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Bidi_M.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute Bidi_M { boolean }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/Emoji.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Emoji.xml new file mode 100644 index 0000000000..7c78734594 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Emoji.xml @@ -0,0 +1,20 @@ + + + code-point-attributes &= + attribute Emoji { boolean }? + + code-point-attributes &= + attribute EPres { boolean }? + + code-point-attributes &= + attribute EMod { boolean }? 
+ + code-point-attributes &= + attribute EBase { boolean }? + + code-point-attributes &= + attribute EComp { boolean }? + + code-point-attributes &= + attribute ExtPict { boolean }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/InCB.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/InCB.xml new file mode 100644 index 0000000000..8340250dc3 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/InCB.xml @@ -0,0 +1,9 @@ + + + code-point-attributes &= + attribute InCB { "Consonant" + | "Extend" + | "Linker" + | "None" + }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/InPC.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/InPC.xml new file mode 100644 index 0000000000..a7de623873 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/InPC.xml @@ -0,0 +1,21 @@ + + + code-point-attributes &= + attribute InPC { "Bottom" + | "Bottom_And_Left" + | "Bottom_And_Right" + | "Left" + | "Left_And_Right" + | "NA" + | "Overstruck" + | "Right" + | "Top" + | "Top_And_Bottom" + | "Top_And_Bottom_And_Left" + | "Top_And_Bottom_And_Right" + | "Top_And_Left" + | "Top_And_Left_And_Right" + | "Top_And_Right" + | "Visual_Order_Left" + }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/InSC.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/InSC.xml new file mode 100644 index 0000000000..ddddc27a4e --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/InSC.xml @@ -0,0 +1,42 @@ + + + code-point-attributes &= + attribute InSC { "Avagraha" + | "Bindu" + | "Brahmi_Joining_Number" + | "Cantillation_Mark" + | "Consonant" + | "Consonant_Dead" + | "Consonant_Final" + | "Consonant_Head_Letter" + | "Consonant_Initial_Postfixed" + | "Consonant_Killer" + | "Consonant_Medial" + | "Consonant_Placeholder" + | "Consonant_Preceding_Repha" + | "Consonant_Prefixed" + | "Consonant_Subjoined" + | "Consonant_Succeeding_Repha" + | "Consonant_With_Stacker" + | "Gemination_Mark" + | "Invisible_Stacker" + | "Joiner" + | "Modifying_Letter" + | "Non_Joiner" + | "Nukta" + | "Number" + | "Number_Joiner" + | "Other" + | "Pure_Killer" + | "Register_Shifter" + | "Reordering_Killer" + | "Syllable_Modifier" + | "Tone_Letter" + | "Tone_Mark" + | "Virama" + | "Visarga" + | "Vowel" + | "Vowel_Dependent" + | "Vowel_Independent" + }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/JSN.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/JSN.xml new file mode 100644 index 0000000000..568f5e270c --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/JSN.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute JSN { xsd:string { pattern="[A-Z]{0,3}" } }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/Join_C.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Join_C.xml new file mode 100644 index 0000000000..4cbf1d0f0f --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Join_C.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute Join_C { boolean }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/Name_Alias.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Name_Alias.xml new file mode 100644 index 0000000000..c2b53b2fef --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Name_Alias.xml @@ -0,0 +1,10 @@ + + + code-point-attributes &= + element name-alias { + attribute alias { xsd:string { pattern="[a-zA-Z0-9]+(( -|- |[\-_ ])[a-zA-Z0-9]+)*" } }?, + attribute type { "abbreviation" | "alternate" + | "control" | "correction" + | "figment" + }? } * + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/Nushu.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Nushu.xml new file mode 100644 index 0000000000..8919bba32e --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Nushu.xml @@ -0,0 +1,8 @@ + + + code-point-attributes &= + attribute kSrc_NushuDuben { xsd:string { pattern="[0-9]+\.[0-9]+" } }? + + code-point-attributes &= + attribute kReading { xsd:string }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/Set_of_code_points.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Set_of_code_points.xml new file mode 100644 index 0000000000..a6ff2d0926 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Set_of_code_points.xml @@ -0,0 +1,8 @@ + + + + set-of-code-points = + attribute cp { single-code-point } + | ( attribute first-cp { single-code-point }, + attribute last-cp { single-code-point } ) + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/Tangut.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Tangut.xml new file mode 100644 index 0000000000..21e52208a5 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Tangut.xml @@ -0,0 +1,18 @@ + + + code-point-attributes &= + attribute kRSTUnicode { xsd:string { pattern="[0-9]+\.[0-9]+" } }? + + code-point-attributes &= + attribute kTGT_MergedSrc + { xsd:string {pattern="L2008-[0-9A-F]{4,5}(-[0-9]{4,5})?"} + | xsd:string {pattern="L2006-[0-9]{4}"} + | xsd:string {pattern="L1997-[0-9]{4}"} + | xsd:string {pattern="L1986-[0-9]{4}"} + | xsd:string {pattern="S1968-[0-9]{4}"} + | xsd:string {pattern="N1966-[0-9]{3}(-[0-9A-Z]{3,4})?"} + | xsd:string {pattern="H2004-[A-Z]-[0-9]{4}"} + | xsd:string {pattern="L2012-[0-9]{4}"} + | xsd:string {pattern="UTN42-[0-9]{3}"} + }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/Unihan.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Unihan.xml new file mode 100644 index 0000000000..ba4c042f8d --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Unihan.xml @@ -0,0 +1,347 @@ + + + code-point-attributes &= attribute kAccountingNumeric + { xsd:string { pattern="[0-9]+" } }? 
+ + code-point-attributes &= attribute kAlternateTotalStrokes + { list { xsd:string { pattern="(\d+:[BHJKMPSUV]+)|-" }+ } }? + + code-point-attributes &= attribute kBigFive + { xsd:string { pattern="[0-9A-F]{4}'?" } }? + + code-point-attributes &= attribute kCangjie + { xsd:string { pattern="[A-Z]+" } }? + + code-point-attributes &= attribute kCantonese + { list { xsd:string { pattern="[a-z]{1,6}[1-6]" }+ } }? + + code-point-attributes &= attribute kCCCII + { list { xsd:string { pattern="[0-9A-F]{6}" }+ } }? + + code-point-attributes &= attribute kCheungBauer + { list { xsd:string { pattern="[0-9]{3}/[0-9]{2};[A-Z]*;[a-z1-6\[\]/,]+" }+ } }? + + code-point-attributes &= attribute kCheungBauerIndex + { list { xsd:string { pattern="[0-9]{3}\.[01][0-9]" }+ } }? + + code-point-attributes &= attribute kCihaiT + { list { xsd:string { pattern="[1-9][0-9]{0,3}\.[0-9]{3}" }+ } }? + + code-point-attributes &= attribute kCNS1986 + { xsd:string { pattern="[12E]-[0-9A-F]{4}" } }? + + code-point-attributes &= attribute kCNS1992 + { xsd:string { pattern="[1-9]-[0-9A-F]{4}" } }? + + code-point-attributes &= attribute kCompatibilityVariant + { "" | xsd:string { pattern="U\+[23]?[0-9A-F]{4}" } }? + + code-point-attributes &= attribute kCowles + { list { xsd:string { pattern="[0-9]{1,4}(\.[0-9]{1,2})?" }+ } }? + + code-point-attributes &= attribute kDaeJaweon + { xsd:string { pattern="[0-9]{4}\.[0-9]{2}[01]" } }? + + code-point-attributes &= attribute kDefinition + { xsd:string { pattern='[^\t"]+' } }? + + code-point-attributes &= attribute kEACC + { xsd:string { pattern="[0-9A-F]{6}" } }? + + code-point-attributes &= attribute kFanqie + { list { xsd:string { pattern="[\x{3400}-\x{4DBF}\x{4E00}-\x{9FFF}\x{20000}-\x{2A6DF}]{2}" }+ } }? + + code-point-attributes &= attribute kFenn + { list { xsd:string { pattern="[0-9]+a?[A-KP*]" }+ } }? + + code-point-attributes &= attribute kFennIndex + { list { xsd:string { pattern="[0-9][0-9]{0,2}\.[01][0-9]" }+ } }? 
+ + code-point-attributes &= attribute kFourCornerCode + { list { xsd:string { pattern="[0-9]{4}(\.[0-9])?" }+ } }? + + code-point-attributes &= attribute kGB0 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB1 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB3 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB5 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB7 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB8 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGradeLevel + { xsd:string { pattern="[1-6]" } }? + + code-point-attributes &= attribute kGSR + { list { xsd:string { pattern="[0-9]{4}[a-vx-z]'?" }+ } }? + + code-point-attributes &= attribute kHangul + { list { xsd:string { pattern="[\x{1100}-\x{1112}][\x{1161}-\x{1175}][\x{11A8}-\x{11C2}]?:[01ENX]{1,3}" }+ } }? + + code-point-attributes &= attribute kHanYu + { list { xsd:string { pattern="[1-8][0-9]{4}\.[0-3][0-9][0-3]" }+ } }? + + code-point-attributes &= attribute kHanyuPinlu + { list { xsd:string { pattern="[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+\([0-9]+\)" }+ } }? + + code-point-attributes &= attribute kHanyuPinyin + { list { xsd:string { pattern="(\d{5}\.\d{2}0,)*\d{5}\.\d{2}0:([a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+,)*[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kHDZRadBreak + { xsd:string { pattern="[\x{2F00}-\x{2FD5}]\[U\+2F[0-9A-D][0-9A-F]\]:[1-8][0-9]{4}\.[0-3][0-9]0" } }? + + code-point-attributes &= attribute kHKGlyph + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kIBMJapan + { list { xsd:string { pattern="F[ABC][0-9A-F]{2}" }+ } }? + + code-point-attributes &= attribute kIICore + { list { xsd:string { pattern="[ABC][GHJKMPT]{1,7}" }+ } }? 
+ + code-point-attributes &= attribute kIRG_GSource + { "" | xsd:string { pattern="G[013578EKS]-[0-9A-F]{4}" } + | xsd:string { pattern="G4K(-\d{5})?" } + | xsd:string { pattern="G(DZ|GH|RM|WZ|XC|XH|ZH)-\d{4}\.\d{2}" } + | xsd:string { pattern="G(BK|CH|CY|HC)(-\d{4}\.\d{2})?" } + | xsd:string { pattern="GKX-\d{4}\.\d{2,3}" } + | xsd:string { pattern="G(HZ|HZR)-\d{5}\.\d{2}" } + | xsd:string { pattern="G(CE|FC|IDC23|OCD|XHZ)-\d{3}" } + | xsd:string { pattern="G(H|HF|LGYJ|PGLG|T)-\d{4}" } + | xsd:string { pattern="G(CYY|DM|JZ|KJ|XM|ZFY|ZJW|ZYS)-\d{5}" } + | xsd:string { pattern="G(FZ|IDC)-[0-9A-F]{4}" } + | xsd:string { pattern="GGFZ-\d{6}" } + | xsd:string { pattern="G(LK|Z)-\d{7}" } + | xsd:string { pattern="GU-[023][0-9A-F]{4}" } + | xsd:string { pattern="GZA-[123467]\d{5}" } + }? + + code-point-attributes &= attribute kIRG_HSource + { "" | xsd:string { pattern="H-[0-9A-F]{4}" } + | xsd:string { pattern="H(B[012])-[0-9A-F]{4}" } + | xsd:string { pattern="HD-[23]?[0-9A-F]{4}" } + | xsd:string { pattern="HU-[023][0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRG_JSource + { "" | xsd:string { pattern="J[014]-[0-9A-F]{4}" } + | xsd:string { pattern="J3A?-[0-9A-F]{4}" } + | xsd:string { pattern="J13A?-[0-9A-F]{4}" } + | xsd:string { pattern="J14-[0-9A-F]{4}" } + | xsd:string { pattern="JA[34]?-[0-9A-F]{4}" } + | xsd:string { pattern="JARIB-[0-9A-F]{4}" } + | xsd:string { pattern="JH-(JT[ABC][0-9A-F]{3}S?|IB\d{4}|\d{6})" } + | xsd:string { pattern="JK-\d{5}" } + | xsd:string { pattern="JMJ-\d{6}" } + }? + + code-point-attributes &= attribute kIRG_KPSource + { "" | xsd:string { pattern="KP([01]-[0-9A-F]{4}|U-[023][0-9A-F]{4})" } }? + + code-point-attributes &= attribute kIRG_KSource + { "" | xsd:string { pattern="K[0-6]-[0-9A-F]{4}" } + | xsd:string { pattern="KC-\d{5}" } + | xsd:string { pattern="KU-[023][0-9A-F]{4}" } + }? 
+ + code-point-attributes &= attribute kIRG_MSource + { "" | xsd:string { pattern="MA-[0-9A-F]{4}" } + | xsd:string { pattern="MB[12]-[0-9A-F]{4}" } + | xsd:string { pattern="MC-\d{5}" } + | xsd:string { pattern="MDH?-[23]?[0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRG_SSource + { "" | xsd:string { pattern="SAT-\d{5}" } }? + + code-point-attributes &= attribute kIRG_TSource + { "" | xsd:string { pattern="T([1-7A-F]|1[1-3])-[0-9A-F]{4}" } + | xsd:string { pattern="TU-[023][0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRG_UKSource + { "" | xsd:string { pattern="UK-\d{5}" } }? + + code-point-attributes &= attribute kIRG_USource + { "" | xsd:string { pattern="UTC-\d{5}" } }? + + code-point-attributes &= attribute kIRG_VSource + { "" | xsd:string { pattern="V[0-4]-[0-9A-F]{4}" } + | xsd:string { pattern="VN-[023F][0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRGDaeJaweon + { list { xsd:string { pattern="[0-9]{4}\.[0-9]{2}[01]" }+ } }? + + code-point-attributes &= attribute kIRGHanyuDaZidian + { list { xsd:string { pattern="[1-8][0-9]{4}\.[0-3][0-9][01]" }+ } }? + + code-point-attributes &= attribute kIRGKangXi + { list { xsd:string { pattern="[01][0-9]{3}\.[0-7][0-9][01]" }+ } }? + + code-point-attributes &= attribute kJa + { list { xsd:string { pattern="[0-9A-F]{4}S?" }+ } }? + + code-point-attributes &= attribute kJapanese + { list { xsd:string { pattern="[\x{3041}-\x{3096}\x{3099}\x{309A}\x{30A1}-\x{30FA}\x{30FC}]+" }+ } }? + + code-point-attributes &= attribute kJapaneseKun + { list { xsd:string { pattern="[A-Z]+" }+ } }? + + code-point-attributes &= attribute kJapaneseOn + { list { xsd:string { pattern="[A-Z]+" }+ } }? + + code-point-attributes &= attribute kJinmeiyoKanji + { list { xsd:string { pattern="(20[0-9]{2})(:U\+[23]?[0-9A-F]{4})?" }+ } }? + + code-point-attributes &= attribute kJis0 + { list { xsd:string { pattern="[0-9]{4}" }+ } }? 
+ + code-point-attributes &= attribute kJis1 + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kJIS0213 + { list { xsd:string { pattern="[12],[0-9]{2},[0-9]{1,2}" }+ } }? + + code-point-attributes &= attribute kJoyoKanji + { list { xsd:string { pattern="(20[0-9]{2})|(U\+[23]?[0-9A-F]{4})" }+ } }? + + code-point-attributes &= attribute kKangXi + { list { xsd:string { pattern="[0-9]{4}\.[0-9]{2}[01]" }+ } }? + + code-point-attributes &= attribute kKarlgren + { list { xsd:string { pattern="[1-9][0-9]{0,3}[A*]?" }+ } }? + + code-point-attributes &= attribute kKorean + { list { xsd:string { pattern="[A-Z]+" }+ } }? + + code-point-attributes &= attribute kKoreanEducationHanja + { list { xsd:string { pattern="20[0-9]{2}" }+ } }? + + code-point-attributes &= attribute kKoreanName + { list { xsd:string { pattern="20[0-9]{2}" }+ } }? + + code-point-attributes &= attribute kLau + { list { xsd:string { pattern="[1-9][0-9]{0,3}" }+ } }? + + code-point-attributes &= attribute kMainlandTelegraph + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kMandarin + { list { xsd:string { pattern="[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kMatthews + { list { xsd:string { pattern="[1-9][0-9]{0,3}(a|\.5)?" }+ } }? + + code-point-attributes &= attribute kMeyerWempe + { list { xsd:string { pattern="[1-9][0-9]{0,3}[a-t*]?" }+ } }? + + code-point-attributes &= attribute kMojiJoho + { list { xsd:string { pattern="MJ\d{6}(:(FE0[01]|E01[01][0-9A-F]))?" }+ } }? + + code-point-attributes &= attribute kMorohashi + { list { xsd:string { pattern="(\d{5}'{0,2}|H\d{3})(:(FE0[01]|E010[0-9A-F]))?" }+ } }? + + code-point-attributes &= attribute kNelson + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kOtherNumeric + { list { xsd:string { pattern="[0-9]+" }+ } }? 
+ + code-point-attributes &= attribute kPhonetic + { list { xsd:string { pattern="[1-9][0-9]{0,3}[A-D]?\*?" }+ } }? + + code-point-attributes &= attribute kPrimaryNumeric + { list { xsd:string { pattern="[0-9]+" }+ } }? + + code-point-attributes &= attribute kPseudoGB1 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kRSAdobe_Japan1_6 + { list { xsd:string { pattern="[CV]\+[0-9]{1,5}\+[1-9][0-9]{0,2}\.[1-9][0-9]?\.[0-9]{1,2}" }+ } }? + + code-point-attributes &= attribute kRSUnicode + { list { xsd:string { pattern="[1-9][0-9]{0,2}'{0,3}\.-?[0-9]{1,2}" }+ } }? + + code-point-attributes &= attribute kSBGY + { list { xsd:string { pattern="[0-9]{3}\.[0-7][0-9]" }+ } }? + + code-point-attributes &= attribute kSemanticVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}(<[ks][A-Za-z0-9_]+(:[TBZFJ]+)?(,[ks][A-Za-z0-9_]+(:[TBZFJ]+)?)*)?" }+ } }? + + code-point-attributes &= attribute kSimplifiedVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}" }+ } }? + + code-point-attributes &= attribute kSMSZD2003Index + { list { xsd:string { pattern="\d{1,3}\.\d{2}" }+ } }? + + code-point-attributes &= attribute kSMSZD2003Readings + { list { xsd:string { pattern="[a-z\x{300}\x{301}\x{302}\x{304}\x{308}\x{30C}]+(,[a-z\x{300}\x{301}\x{302}\x{304}\x{308}\x{30C}]+)*\x{7CB5}[a-z]+[1-6]([a-z]+[1-6])?(,[a-z]+[1-6]([a-z]+[1-6])?)*" }+ } }? + + code-point-attributes &= attribute kSpecializedSemanticVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}(<[ks][A-Za-z0-9_]+(:[TBZFJ]+)?(,[ks][A-Za-z0-9_]+(:[TBZFJ]+)?)*)?" }+ } }? + + code-point-attributes &= attribute kSpoofingVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}" }+ } }? + + code-point-attributes &= attribute kStrange + { list { ( xsd:string { pattern="[ACU]" } + | xsd:string { pattern="B:U\+31[0-2AB][0-9A-F]" } + | xsd:string { pattern="[FMOR](:U\+[23]?[0-9A-F]{4})?" 
} + | xsd:string { pattern="H:U\+31[3-8][0-9A-F]" } + | xsd:string { pattern="I(:U\+[23]?[0-9A-F]{4})*" } + | xsd:string { pattern="K(:U\+30[A-F][0-9A-F])+" } + | xsd:string { pattern="S:[4-9][0-9]" } + )+}}? + + code-point-attributes &= attribute kTaiwanTelegraph + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kTang + { list { xsd:string { pattern="\*?[A-Za-z()\x{E6}\x{251}\x{259}\x{25B}\x{300}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kTGH + { list { xsd:string { pattern="20[0-9]{2}:[1-9][0-9]{0,3}" }+ } }? + + code-point-attributes &= attribute kTGHZ2013 + { list { xsd:string { pattern="[0-9]{3}\.[0-9]{3}(,[0-9]{3}\.[0-9]{3})*:[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kTotalStrokes + { list { xsd:string { pattern="[1-9][0-9]{0,2}" }+ } }? + + code-point-attributes &= attribute kTraditionalVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}" }+ } }? + + code-point-attributes &= attribute kUnihanCore2020 + { xsd:string { pattern="[GHJKMPT]{1,7}" } }? + + code-point-attributes &= attribute kVietnamese + { list { xsd:string { pattern="[A-Za-z\x{110}\x{111}\x{300}-\x{303}\x{306}\x{309}\x{31B}\x{323}]+" }+ } }? + + code-point-attributes &= attribute kVietnameseNumeric + { list { xsd:string { pattern="\d+" }+ } }? + + code-point-attributes &= attribute kXerox + { list { xsd:string { pattern="[0-9]{3}:[0-9]{3}" }+ } }? + + code-point-attributes &= attribute kXHC1983 + { list { xsd:string { pattern="[0-9]{4}\.[0-9]{3}\*?(,[0-9]{4}\.[0-9]{3}\*?)*:[a-z\x{300}\x{301}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kZhuang + { list { xsd:string { pattern="[a-z]+\*?" }+ } }? + + code-point-attributes &= attribute kZhuangNumeric + { list { xsd:string { pattern="\d+" }+ } }? + + code-point-attributes &= attribute kZVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}(<[ks][A-Za-z0-9_]+(:[TBZ]+)?(,[ks][A-Za-z0-9_]+(:[TBZ]+)?)*)?" 
}+ } }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/age.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/age.xml new file mode 100644 index 0000000000..8a1722f229 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/age.xml @@ -0,0 +1,23 @@ + + + code-point-attributes &= + attribute age { "1.1" + | "2.0" | "2.1" + | "3.0" | "3.1" | "3.2" + | "4.0" | "4.1" + | "5.0" | "5.1" | "5.2" + | "6.0" | "6.1" | "6.2" | "6.3" + | "7.0" + | "8.0" + | "9.0" + | "10.0" + | "11.0" + | "12.0" | "12.1" + | "13.0" + | "14.0" + | "15.0" | "15.1" + | "16.0" + | "17.0" + | "unassigned" + }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/bc.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/bc.xml new file mode 100644 index 0000000000..d3e70a6abe --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/bc.xml @@ -0,0 +1,17 @@ + + + code-point-attributes &= + attribute bc { "AL" | "AN" + | "B" | "BN" + | "CS" + | "EN" | "ES" | "ET" + | "FSI" + | "L" | "LRE" | "LRI" | "LRO" + | "NSM" + | "ON" + | "PDF" | "PDI" + | "R" | "RLE" | "RLI" | "RLO" + | "S" + | "WS" + }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/blk.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/blk.xml new file mode 100644 index 0000000000..ecd721a634 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/blk.xml @@ -0,0 +1,344 @@ + + + code-point-attributes &= + attribute blk { "Adlam" + | "Aegean_Numbers" + | "Ahom" + | "Alchemical" + | "Alphabetic_PF" + | "Anatolian_Hieroglyphs" + | "Ancient_Greek_Music" + | "Ancient_Greek_Numbers" + | "Ancient_Symbols" + | "Arabic" + | "Arabic_Ext_A" + | "Arabic_Ext_B" + | "Arabic_Ext_C" + | "Arabic_Math" + | "Arabic_PF_A" + | "Arabic_PF_B" + | "Arabic_Sup" + | "Armenian" + | "Arrows" + | "ASCII" + | "Avestan" + | "Balinese" + | "Bamum" + | "Bamum_Sup" + | "Bassa_Vah" + | "Batak" + | "Bengali" + | "Bhaiksuki" + | "Block_Elements" + | "Bopomofo" + | "Bopomofo_Ext" + | "Box_Drawing" + | "Brahmi" + | "Braille" + | "Buginese" + | "Buhid" + | "Byzantine_Music" + | "Carian" + | "Caucasian_Albanian" + | "Chakma" + | "Cham" + | "Cherokee" + | "Cherokee_Sup" + | "Chess_Symbols" + | "Chorasmian" + | "CJK" + | "CJK_Compat" + | "CJK_Compat_Forms" + | "CJK_Compat_Ideographs" + | "CJK_Compat_Ideographs_Sup" + | "CJK_Ext_A" + | "CJK_Ext_B" + | "CJK_Ext_C" + | "CJK_Ext_D" + | "CJK_Ext_E" + | "CJK_Ext_F" + | "CJK_Ext_G" + | "CJK_Ext_H" + | "CJK_Ext_I" + | "CJK_Radicals_Sup" + | "CJK_Strokes" + | "CJK_Symbols" + | "Compat_Jamo" + | "Control_Pictures" + | "Coptic" + | "Coptic_Epact_Numbers" + | "Counting_Rod" + | "Cuneiform" + | "Cuneiform_Numbers" + | "Currency_Symbols" + | "Cypriot_Syllabary" + | "Cypro_Minoan" + | "Cyrillic" + | "Cyrillic_Ext_A" + | "Cyrillic_Ext_B" + | "Cyrillic_Ext_C" + | "Cyrillic_Ext_D" + | "Cyrillic_Sup" + | "Deseret" + | "Devanagari" + | "Devanagari_Ext" + | "Devanagari_Ext_A" + | "Diacriticals" + | "Diacriticals_Ext" + | "Diacriticals_For_Symbols" + | "Diacriticals_Sup" + | "Dingbats" + | "Dives_Akuru" + | "Dogra" + 
| "Domino" + | "Duployan" + | "Early_Dynastic_Cuneiform" + | "Egyptian_Hieroglyph_Format_Controls" + | "Egyptian_Hieroglyphs" + | "Egyptian_Hieroglyphs_Ext_A" + | "Elbasan" + | "Elymaic" + | "Emoticons" + | "Enclosed_Alphanum" + | "Enclosed_Alphanum_Sup" + | "Enclosed_CJK" + | "Enclosed_Ideographic_Sup" + | "Ethiopic" + | "Ethiopic_Ext" + | "Ethiopic_Ext_A" + | "Ethiopic_Ext_B" + | "Ethiopic_Sup" + | "Garay" + | "Geometric_Shapes" + | "Geometric_Shapes_Ext" + | "Georgian" + | "Georgian_Ext" + | "Georgian_Sup" + | "Glagolitic" + | "Glagolitic_Sup" + | "Gothic" + | "Grantha" + | "Greek" + | "Greek_Ext" + | "Gujarati" + | "Gunjala_Gondi" + | "Gurmukhi" + | "Gurung_Khema" + | "Half_And_Full_Forms" + | "Half_Marks" + | "Hangul" + | "Hanifi_Rohingya" + | "Hanunoo" + | "Hatran" + | "Hebrew" + | "High_PU_Surrogates" + | "High_Surrogates" + | "Hiragana" + | "IDC" + | "Ideographic_Symbols" + | "Imperial_Aramaic" + | "Indic_Number_Forms" + | "Indic_Siyaq_Numbers" + | "Inscriptional_Pahlavi" + | "Inscriptional_Parthian" + | "IPA_Ext" + | "Jamo" + | "Jamo_Ext_A" + | "Jamo_Ext_B" + | "Javanese" + | "Kaithi" + | "Kaktovik_Numerals" + | "Kana_Ext_A" + | "Kana_Ext_B" + | "Kana_Sup" + | "Kanbun" + | "Kangxi" + | "Kannada" + | "Katakana" + | "Katakana_Ext" + | "Kawi" + | "Kayah_Li" + | "Kharoshthi" + | "Khitan_Small_Script" + | "Khmer" + | "Khmer_Symbols" + | "Khojki" + | "Khudawadi" + | "Kirat_Rai" + | "Lao" + | "Latin_1_Sup" + | "Latin_Ext_A" + | "Latin_Ext_Additional" + | "Latin_Ext_B" + | "Latin_Ext_C" + | "Latin_Ext_D" + | "Latin_Ext_E" + | "Latin_Ext_F" + | "Latin_Ext_G" + | "Lepcha" + | "Letterlike_Symbols" + | "Limbu" + | "Linear_A" + | "Linear_B_Ideograms" + | "Linear_B_Syllabary" + | "Lisu" + | "Lisu_Sup" + | "Low_Surrogates" + | "Lycian" + | "Lydian" + | "Mahajani" + | "Mahjong" + | "Makasar" + | "Malayalam" + | "Mandaic" + | "Manichaean" + | "Marchen" + | "Masaram_Gondi" + | "Math_Alphanum" + | "Math_Operators" + | "Mayan_Numerals" + | "Medefaidrin" + | "Meetei_Mayek" + | 
"Meetei_Mayek_Ext" + | "Mende_Kikakui" + | "Meroitic_Cursive" + | "Meroitic_Hieroglyphs" + | "Miao" + | "Misc_Arrows" + | "Misc_Math_Symbols_A" + | "Misc_Math_Symbols_B" + | "Misc_Pictographs" + | "Misc_Symbols" + | "Misc_Technical" + | "Modi" + | "Modifier_Letters" + | "Modifier_Tone_Letters" + | "Mongolian" + | "Mongolian_Sup" + | "Mro" + | "Multani" + | "Music" + | "Myanmar" + | "Myanmar_Ext_A" + | "Myanmar_Ext_B" + | "Myanmar_Ext_C" + | "Nabataean" + | "Nag_Mundari" + | "Nandinagari" + | "NB" + | "New_Tai_Lue" + | "Newa" + | "NKo" + | "Number_Forms" + | "Nushu" + | "Nyiakeng_Puachue_Hmong" + | "OCR" + | "Ogham" + | "Ol_Chiki" + | "Ol_Onal" + | "Old_Hungarian" + | "Old_Italic" + | "Old_North_Arabian" + | "Old_Permic" + | "Old_Persian" + | "Old_Sogdian" + | "Old_South_Arabian" + | "Old_Turkic" + | "Old_Uyghur" + | "Oriya" + | "Ornamental_Dingbats" + | "Osage" + | "Osmanya" + | "Ottoman_Siyaq_Numbers" + | "Pahawh_Hmong" + | "Palmyrene" + | "Pau_Cin_Hau" + | "Phags_Pa" + | "Phaistos" + | "Phoenician" + | "Phonetic_Ext" + | "Phonetic_Ext_Sup" + | "Playing_Cards" + | "Psalter_Pahlavi" + | "PUA" + | "Punctuation" + | "Rejang" + | "Rumi" + | "Runic" + | "Samaritan" + | "Saurashtra" + | "Sharada" + | "Shavian" + | "Shorthand_Format_Controls" + | "Siddham" + | "Sinhala" + | "Sinhala_Archaic_Numbers" + | "Small_Forms" + | "Small_Kana_Ext" + | "Sogdian" + | "Sora_Sompeng" + | "Soyombo" + | "Specials" + | "Sundanese" + | "Sundanese_Sup" + | "Sunuwar" + | "Sup_Arrows_A" + | "Sup_Arrows_B" + | "Sup_Arrows_C" + | "Sup_Math_Operators" + | "Sup_PUA_A" + | "Sup_PUA_B" + | "Sup_Punctuation" + | "Sup_Symbols_And_Pictographs" + | "Super_And_Sub" + | "Sutton_SignWriting" + | "Syloti_Nagri" + | "Symbols_And_Pictographs_Ext_A" + | "Symbols_For_Legacy_Computing" + | "Symbols_For_Legacy_Computing_Sup" + | "Syriac" + | "Syriac_Sup" + | "Tagalog" + | "Tagbanwa" + | "Tags" + | "Tai_Le" + | "Tai_Tham" + | "Tai_Viet" + | "Tai_Xuan_Jing" + | "Takri" + | "Tamil" + | "Tamil_Sup" + | "Tangsa" + | 
"Tangut" + | "Tangut_Components" + | "Tangut_Sup" + | "Telugu" + | "Thaana" + | "Thai" + | "Tibetan" + | "Tifinagh" + | "Tirhuta" + | "Todhri" + | "Toto" + | "Transport_And_Map" + | "Tulu_Tigalari" + | "UCAS" + | "UCAS_Ext" + | "UCAS_Ext_A" + | "Ugaritic" + | "Vai" + | "Vedic_Ext" + | "Vertical_Forms" + | "Vithkuqi" + | "VS" + | "VS_Sup" + | "Wancho" + | "Warang_Citi" + | "Yezidi" + | "Yi_Radicals" + | "Yi_Syllables" + | "Yijing" + | "Zanabazar_Square" + | "Znamenny_Music" + }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/block.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/block.xml new file mode 100644 index 0000000000..1d9b2beb8b --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/block.xml @@ -0,0 +1,10 @@ + + + + ucd.content &= + element blocks { + element block { + attribute first-cp { single-code-point }, + attribute last-cp { single-code-point }, + attribute name { text } }+ }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/bmg.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/bmg.xml new file mode 100644 index 0000000000..d4431070d5 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/bmg.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute bmg { "" | single-code-point }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/boolean.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/boolean.xml new file mode 100644 index 0000000000..fae36d68db --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/boolean.xml @@ -0,0 +1,4 @@ + + + boolean = "Y" | "N" + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/boundaries.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/boundaries.xml new file mode 100644 index 0000000000..abe4ffe9a0 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/boundaries.xml @@ -0,0 +1,58 @@ + + + code-point-attributes &= + attribute Gr_Base { boolean }? + + code-point-attributes &= + attribute Gr_Ext { boolean }? + + code-point-attributes &= + attribute OGr_Ext { boolean }? + + code-point-attributes &= + attribute Gr_Link { boolean }? + + code-point-attributes &= + attribute GCB { "CN" | "CR" + | "EB" | "EBG" | "EM" | "EX" + | "GAZ" + | "L" | "LF" | "LV" | "LVT" + | "PP" + | "RI" + | "SM" + | "T" + | "V" + | "XX" + | "ZWJ" + }? + + code-point-attributes &= + attribute WB { "CR" + | "DQ" + | "EB" | "EBG" | "EM" | "EX" | "Extend" + | "FO" + | "GAZ" + | "HL" + | "KA" + | "LE" | "LF" + | "MB" | "ML" | "MN" + | "NL" | "NU" + | "RI" + | "SQ" + | "WSegSpace" + | "XX" + | "ZWJ" + }? + + code-point-attributes &= + attribute SB { "AT" + | "CL" | "CR" + | "EX" + | "FO" + | "LE" | "LF" | "LO" + | "NU" + | "SC" | "SE" | "SP" | "ST" + | "UP" + | "XX" + }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/bpb.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/bpb.xml new file mode 100644 index 0000000000..3924ed3e9d --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/bpb.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute bpb { "#" | single-code-point }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/bpt.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/bpt.xml new file mode 100644 index 0000000000..183c9bf3f1 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/bpt.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute bpt { "o" | "c" | "n" }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/case_folding.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/case_folding.xml new file mode 100644 index 0000000000..8708699bee --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/case_folding.xml @@ -0,0 +1,8 @@ + + + code-point-attributes &= + attribute scf { "#" | single-code-point }? + + code-point-attributes &= + attribute cf { "#" | one-or-more-code-points }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/case_mapping.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/case_mapping.xml new file mode 100644 index 0000000000..c1296b7b94 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/case_mapping.xml @@ -0,0 +1,11 @@ + + + code-point-attributes &= + attribute uc { "#" | one-or-more-code-points }? + + code-point-attributes &= + attribute lc { "#" | one-or-more-code-points }? + + code-point-attributes &= + attribute tc { "#" | one-or-more-code-points }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/case_other.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/case_other.xml new file mode 100644 index 0000000000..df4b97e640 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/case_other.xml @@ -0,0 +1,32 @@ + + + code-point-attributes &= + attribute CI { boolean }? + + code-point-attributes &= + attribute Cased { boolean }? + + code-point-attributes &= + attribute CWCF { boolean }? + + code-point-attributes &= + attribute CWCM { boolean }? + + code-point-attributes &= + attribute CWL { boolean }? + + code-point-attributes &= + attribute CWKCF { boolean }? + + code-point-attributes &= + attribute CWT { boolean }? + + code-point-attributes &= + attribute CWU { boolean }? + + code-point-attributes &= + attribute NFKC_CF { "#" | zero-or-more-code-points }? + + code-point-attributes &= + attribute NFKC_SCF { "#" | zero-or-more-code-points }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/casing.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/casing.xml new file mode 100644 index 0000000000..503f059999 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/casing.xml @@ -0,0 +1,14 @@ + + + code-point-attributes &= + attribute Upper { boolean }? + + code-point-attributes &= + attribute Lower { boolean }? + + code-point-attributes &= + attribute OUpper { boolean }? + + code-point-attributes &= + attribute OLower { boolean }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/ccc.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/ccc.xml new file mode 100644 index 0000000000..8226509d71 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/ccc.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute ccc { xsd:integer { minInclusive="0" maxInclusive="254" } }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/cjk-radicals.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/cjk-radicals.xml new file mode 100644 index 0000000000..45c49ed2c1 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/cjk-radicals.xml @@ -0,0 +1,10 @@ + + + + ucd.content &= + element cjk-radicals { + element cjk-radical { + attribute number { xsd:string {pattern="[0-9]{1,3}'{0,3}"}}, + attribute radical { single-code-point? }, + attribute ideograph { single-code-point } }+ }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/cjkEACC.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/cjkEACC.xml new file mode 100644 index 0000000000..08222c4f01 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/cjkEACC.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= attribute cjkEACC + { xsd:string { pattern="[0-9A-F]{6}" } }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/cjkIRG_TSource.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/cjkIRG_TSource.xml new file mode 100644 index 0000000000..49f9c3917d --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/cjkIRG_TSource.xml @@ -0,0 +1,6 @@ + + + code-point-attributes &= attribute cjkIRG_TSource + { xsd:string { pattern="T([1-7A-F]|1[1-3])-[0-9A-F]{4} +| TU-[023][0-9A-F]{4}" } }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/composition.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/composition.xml new file mode 100644 index 0000000000..96ce4abcf6 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/composition.xml @@ -0,0 +1,8 @@ + + + code-point-attributes &= + attribute CE { boolean }? + + code-point-attributes &= + attribute Comp_Ex { boolean }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/datatypes.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/datatypes.xml new file mode 100644 index 0000000000..c26367d970 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/datatypes.xml @@ -0,0 +1,5 @@ + + + + # default; datatypes xsd = "http://www.w3.org/2001/XMLSchema-datatypes" + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/datatypes_code_points.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/datatypes_code_points.xml new file mode 100644 index 0000000000..c3cda88df1 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/datatypes_code_points.xml @@ -0,0 +1,9 @@ + + + + single-code-point = xsd:string { pattern = "(|[1-9A-F]|(10))[0-9A-F]{4}" } + + one-or-more-code-points = list { single-code-point + } + zero-or-more-code-points = list { single-code-point * } + two-code-points = list { single-code-point, single-code-point } + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/decomposition.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/decomposition.xml new file mode 100644 index 0000000000..833a7d1e06 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/decomposition.xml @@ -0,0 +1,11 @@ + + + code-point-attributes &= + attribute dt { "can" | "com" | 
"enc" | "fin" | "font" | "fra" + | "init" | "iso" | "med" | "nar" | "nb" | "sml" + | "sqr" | "sub" | "sup" | "vert" | "wide" | "none" + }? + + code-point-attributes &= + attribute dm { "#" | zero-or-more-code-points }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/description.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/description.xml new file mode 100644 index 0000000000..97bb063e7d --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/description.xml @@ -0,0 +1,6 @@ + + + + ucd.content &= + element description { text }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/do-not-emit.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/do-not-emit.xml new file mode 100644 index 0000000000..5381491e7f --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/do-not-emit.xml @@ -0,0 +1,22 @@ + + + ucd.content &= + element do-not-emit { + element instead { + attribute of { one-or-more-code-points }, + attribute use { one-or-more-code-points }, + attribute because { "Bengali_Khanda_Ta" + | "Deprecated" + | "Discouraged" + | "Dotless_Form" + | "Hamza_Form" + | "Indic_Atomic_Consonant" + | "Indic_Consonant_Conjunct" + | "Indic_Vowel_Letter" + | "Malayalam_Chillu" + | "Precomposed_Form" + | "Precomposed_Hieroglyph" + | "Preferred_Spelling" + | "Tamil_Shrii" + } }+ }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/ea.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/ea.xml new file mode 100644 index 0000000000..d51bf24414 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/ea.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute ea { "A" | "F" | "H" | "N" | "Na" | "W" }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/emoji-sources.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/emoji-sources.xml new file mode 100644 index 0000000000..96d122953e --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/emoji-sources.xml @@ -0,0 +1,11 @@ + + + + ucd.content &= + element emoji-sources { + element emoji-source { + attribute unicode { one-or-more-code-points }, + attribute docomo { jis-code-point? }, + attribute kddi { jis-code-point? }, + attribute softbank { jis-code-point? } }+ }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/function_graphic.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/function_graphic.xml new file mode 100644 index 0000000000..7ce510adc0 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/function_graphic.xml @@ -0,0 +1,68 @@ + + + code-point-attributes &= + attribute Dash { boolean }? + + code-point-attributes &= + attribute Hyphen { boolean }? + + code-point-attributes &= + attribute QMark { boolean }? + + code-point-attributes &= + attribute Term { boolean }? + + code-point-attributes &= + attribute STerm { boolean }? + + code-point-attributes &= + attribute Dia { boolean }? + + code-point-attributes &= + attribute Ext { boolean }? + + code-point-attributes &= + attribute SD { boolean }? + + code-point-attributes &= + attribute Alpha { boolean }? + + code-point-attributes &= + attribute OAlpha { boolean }? + + code-point-attributes &= + attribute Math { boolean }? + + code-point-attributes &= + attribute OMath { boolean }? + + code-point-attributes &= + attribute Hex { boolean }? + + code-point-attributes &= + attribute AHex { boolean }? + + code-point-attributes &= + attribute DI { boolean }? + + code-point-attributes &= + attribute ODI { boolean }? + + code-point-attributes &= + attribute LOE { boolean }? 
+ + code-point-attributes &= + attribute PCM { boolean }? + + code-point-attributes &= + attribute MCM { boolean }? + + code-point-attributes &= + attribute WSpace { boolean }? + + code-point-attributes &= + attribute vo { "R" | "Tr" | "Tu" | "U" }? + + code-point-attributes &= + attribute RI { boolean }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/gc.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/gc.xml new file mode 100644 index 0000000000..36cd1f7749 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/gc.xml @@ -0,0 +1,12 @@ + + + code-point-attributes &= + attribute gc { "Cc" | "Cf" | "Cn" | "Co" | "Cs" + | "Ll" | "Lm" | "Lo" | "Lt" | "Lu" + | "Mc" | "Me" | "Mn" + | "Nd" | "Nl" | "No" + | "Pc" | "Pd" | "Pe" | "Pf" | "Pi" | "Po" | "Ps" + | "Sc" | "Sk" | "Sm" | "So" + | "Zl" | "Zp" | "Zs" + }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/groups.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/groups.xml new file mode 100644 index 0000000000..11f3b0dd97 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/groups.xml @@ -0,0 +1,8 @@ + + + + group = + element group { + code-point-attributes, + code-point* } + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/hst.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/hst.xml new file mode 100644 index 0000000000..385cd466ab --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/hst.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute hst { "L" | "LV" | "LVT" | "NA" | "T" | "V" }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/identifier.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/identifier.xml new file mode 100644 index 0000000000..0ab95a27f0 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/identifier.xml @@ -0,0 +1,26 @@ + + + code-point-attributes &= + attribute IDS { boolean }? + + code-point-attributes &= + attribute OIDS { boolean }? + + code-point-attributes &= + attribute XIDS { boolean }? + + code-point-attributes &= + attribute IDC { boolean }? + + code-point-attributes &= + attribute OIDC { boolean }? + + code-point-attributes &= + attribute XIDC { boolean }? + + code-point-attributes &= + attribute ID_Compat_Math_Start { boolean }? + + code-point-attributes &= + attribute ID_Compat_Math_Continue { boolean }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/ideographs.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/ideographs.xml new file mode 100644 index 0000000000..0c758e3425 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/ideographs.xml @@ -0,0 +1,23 @@ + + + code-point-attributes &= + attribute Ideo { boolean }? + + code-point-attributes &= + attribute UIdeo { boolean }? + + code-point-attributes &= + attribute EqUIdeo { single-code-point }? + + code-point-attributes &= + attribute IDSB { boolean }? + + code-point-attributes &= + attribute IDST { boolean }? + + code-point-attributes &= + attribute IDSU { boolean }? + + code-point-attributes &= + attribute Radical { boolean }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/isc.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/isc.xml new file mode 100644 index 0000000000..f19b593171 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/isc.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute isc { text }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/jis-code-point.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/jis-code-point.xml new file mode 100644 index 0000000000..9a6820c7b4 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/jis-code-point.xml @@ -0,0 +1,5 @@ + + + + jis-code-point = xsd:string { pattern = "[0-9A-F]{4}" } + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/joining.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/joining.xml new file mode 100644 index 0000000000..184fcca14d --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/joining.xml @@ -0,0 +1,53 @@ + + + code-point-attributes &= + attribute jt { "C" | "D" | "L" | "R" | "T" | "U" }? 
+ + code-point-attributes &= + attribute jg { "African_Feh" | "African_Noon" | "African_Qaf" + | "Ain" | "Alaph" | "Alef" + | "Beh" | "Beth" | "Burushaski_Yeh_Barree" + | "Dal" | "Dalath_Rish" + | "E" + | "Farsi_Yeh" | "Fe" | "Feh" | "Final_Semkath" + | "Gaf" | "Gamal" + | "Hah" | "Hanifi_Rohingya_Kinna_Ya" + | "Hanifi_Rohingya_Pa" | "He" | "Heh" | "Heh_Goal" + | "Heth" + | "Kaf" | "Kaph" | "Kashmiri_Yeh" | "Khaph" + | "Knotted_Heh" + | "Lam" | "Lamadh" + | "Malayalam_Bha" | "Malayalam_Ja" | "Malayalam_Lla" + | "Malayalam_Llla" | "Malayalam_Nga" + | "Malayalam_Nna" | "Malayalam_Nnna" + | "Malayalam_Nya" | "Malayalam_Ra" | "Malayalam_Ssa" + | "Malayalam_Tta" | "Manichaean_Aleph" + | "Manichaean_Ayin" | "Manichaean_Beth" + | "Manichaean_Daleth" | "Manichaean_Dhamedh" + | "Manichaean_Five" | "Manichaean_Gimel" + | "Manichaean_Heth" | "Manichaean_Hundred" + | "Manichaean_Kaph" | "Manichaean_Lamedh" + | "Manichaean_Mem" | "Manichaean_Nun" + | "Manichaean_One" | "Manichaean_Pe" + | "Manichaean_Qoph" | "Manichaean_Resh" + | "Manichaean_Sadhe" | "Manichaean_Samekh" + | "Manichaean_Taw" | "Manichaean_Ten" + | "Manichaean_Teth" | "Manichaean_Thamedh" + | "Manichaean_Twenty" | "Manichaean_Waw" + | "Manichaean_Yodh" | "Manichaean_Zayin" | "Meem" + | "Mim" + | "No_Joining_Group" | "Noon" | "Nun" | "Nya" + | "Pe" + | "Qaf" | "Qaph" + | "Reh" | "Reversed_Pe" | "Rohingya_Yeh" + | "Sad" | "Sadhe" | "Seen" | "Semkath" | "Shin" + | "Straight_Waw" | "Swash_Kaf" | "Syriac_Waw" + | "Tah" | "Taw" | "Teh_Marbuta" | "Teh_Marbuta_Goal" + | "Teth" | "Thin_Yeh" + | "Vertical_Tail" + | "Waw" + | "Yeh" | "Yeh_Barree" | "Yeh_With_Tail" | "Yudh" + | "Yudh_He" + | "Zain" | "Zhain" + }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/lb.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/lb.xml new file mode 100644 index 0000000000..ee1f36cac0 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/lb.xml @@ -0,0 +1,24 @@ + + + code-point-attributes &= + attribute lb { "AI" | "AK" | "AL" | "AP" | "AS" + | "B2" | "BA" | "BB" | "BK" + | "CB" | "CJ" | "CL" | "CM" | "CP" | "CR" + | "EB" | "EM" | "EX" + | "GL" + | "H2" | "H3" | "HL" | "HY" + | "ID" | "IN" | "IS" + | "JL" | "JT" | "JV" + | "LF" + | "NL" | "NS" | "NU" + | "OP" + | "PO" | "PR" + | "QU" + | "RI" + | "SA" | "SG" | "SP" | "SY" + | "VF" | "VI" + | "WJ" + | "XX" + | "ZW" | "ZWJ" + }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/miscellaneous.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/miscellaneous.xml new file mode 100644 index 0000000000..5dafe8c223 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/miscellaneous.xml @@ -0,0 +1,11 @@ + + + code-point-attributes &= + attribute Dep { boolean }? + + code-point-attributes &= + attribute VS { boolean }? + + code-point-attributes &= + attribute NChar { boolean }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/na.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/na.xml new file mode 100644 index 0000000000..4c4644c311 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/na.xml @@ -0,0 +1,13 @@ + + + code-point-attributes &= + attribute na { "" | + "CJK UNIFIED IDEOGRAPH-#" | + "CJK COMPATIBILITY IDEOGRAPH-#" | + "EGYPTIAN HIEROGLYPH-#" | + "TANGUT IDEOGRAPH-#" | + "KHITAN SMALL SCRIPT CHARACTER-#" | + "NUSHU CHARACTER-#" | + xsd:string { pattern="[a-zA-Z0-9]+(( -|- |[\-_ ])[a-zA-Z0-9]+)*" } + }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/na1.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/na1.xml new file mode 100644 index 0000000000..592de98c37 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/na1.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute na1 { "" | xsd:string { pattern="[a-zA-Z0-9]+([\-_ ][a-zA-Z0-9]+)*( \(.*\))?" } }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/named-sequences.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/named-sequences.xml new file mode 100644 index 0000000000..2859ea29d9 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/named-sequences.xml @@ -0,0 +1,15 @@ + + + + ucd.content &= + element named-sequences { + element named-sequence { + attribute cps { one-or-more-code-points }, + attribute name { text } }+ }? + + ucd.content &= + element provisional-named-sequences { + element named-sequence { + attribute cps { one-or-more-code-points }, + attribute name { text } }+ }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/namespace.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/namespace.xml new file mode 100644 index 0000000000..e75306a26f --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/namespace.xml @@ -0,0 +1,5 @@ + + + + default namespace ucd = "http://www.unicode.org/ns/2003/ucd/1.0" + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/normalization-corrections.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/normalization-corrections.xml new file mode 100644 index 0000000000..7231a8c261 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/normalization-corrections.xml @@ -0,0 +1,11 @@ + + + + ucd.content &= + element normalization-corrections { + element normalization-correction { + attribute cp { single-code-point }, + attribute old { one-or-more-code-points }, + attribute new { one-or-more-code-points }, + attribute version { text } }+ }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/numeric.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/numeric.xml new file mode 100644 index 0000000000..24230aee1a --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/numeric.xml @@ -0,0 +1,8 @@ + + + code-point-attributes &= + attribute nt { "De" | "Di" | "Nu" | "None" }? + + code-point-attributes &= + attribute nv { "NaN" | xsd:string { pattern="-?[0-9]+(/[0-9]+)?" } }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/pattern.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/pattern.xml new file mode 100644 index 0000000000..baa00a73c7 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/pattern.xml @@ -0,0 +1,8 @@ + + + code-point-attributes &= + attribute Pat_Syn { boolean }? + + code-point-attributes &= + attribute Pat_WS { boolean }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/quickcheck.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/quickcheck.xml new file mode 100644 index 0000000000..224c2287ea --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/quickcheck.xml @@ -0,0 +1,31 @@ + + + code-point-attributes &= + attribute NFC_QC { "Y" | "N" | "M" }? + + code-point-attributes &= + attribute NFD_QC { "Y" | "N" }? + + code-point-attributes &= + attribute NFKC_QC { "Y" | "N" | "M" }? + + code-point-attributes &= + attribute NFKD_QC { "Y" | "N" }? + + + code-point-attributes &= + attribute XO_NFC { boolean }? + + code-point-attributes &= + attribute XO_NFD { boolean }? + + code-point-attributes &= + attribute XO_NFKC { boolean }? + + code-point-attributes &= + attribute XO_NFKD { boolean }? + + + code-point-attributes &= + attribute FC_NFKC { "#" | one-or-more-code-points }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/repertoire.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/repertoire.xml new file mode 100644 index 0000000000..0cfc86e40a --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/repertoire.xml @@ -0,0 +1,6 @@ + + + + ucd.content &= + element repertoire { (code-point | group) + }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/repertoire_Code_points.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/repertoire_Code_points.xml new file mode 100644 index 0000000000..cdfd1ad884 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/repertoire_Code_points.xml @@ -0,0 +1,23 @@ + + + + code-point |= + element reserved { + set-of-code-points, + code-point-attributes } + + code-point |= + element noncharacter { + set-of-code-points, + code-point-attributes } + + code-point |= + element surrogate { + set-of-code-points, + code-point-attributes } + + code-point |= + element char { + set-of-code-points, + code-point-attributes } + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/script.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/script.xml new file mode 100644 index 0000000000..b22243aaf8 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/script.xml @@ -0,0 +1,49 @@ + + + script = "Adlm" | "Aghb" | "Ahom" | "Arab" | "Armi" | "Armn" + | "Avst" + | "Bali" | "Bamu" | "Bass" | "Batk" | "Beng" | "Bhks" + | "Bopo" | "Brah" | "Brai" | "Bugi" | "Buhd" + | "Cakm" | "Cans" | "Cari" | "Cham" | "Cher" | "Chrs" + | "Copt" | "Cpmn" | "Cprt" | "Cyrl" + | "Deva" | "Diak" | "Dogr" | "Dsrt" | "Dupl" + | "Egyp" | "Elba" | "Elym" | "Ethi" + | "Gara" | "Geor" | "Glag" | "Gong" | "Gonm" | "Goth" + | "Gran" | "Grek" | "Gujr" | "Gukh" | "Guru" + | "Hang" | "Hani" | "Hano" | "Hatr" | "Hebr" | "Hira" + | "Hluw" | "Hmng" | "Hmnp" | "Hrkt" | "Hung" + | "Ital" + | "Java" + | "Kali" | "Kana" | "Kawi" | "Khar" | "Khmr" | "Khoj" + | "Kits" | "Knda" | "Krai" | "Kthi" + | "Lana" | "Laoo" | "Latn" | "Lepc" | "Limb" | "Lina" + | "Linb" | "Lisu" | "Lyci" | "Lydi" + | "Mahj" | "Maka" | "Mand" | "Mani" | "Marc" | "Medf" + | "Mend" | "Merc" | "Mero" | "Mlym" | "Modi" | "Mong" + | "Mroo" | "Mtei" 
| "Mult" | "Mymr" + | "Nagm" | "Nand" | "Narb" | "Nbat" | "Newa" | "Nkoo" + | "Nshu" + | "Ogam" | "Olck" | "Onao" | "Orkh" | "Orya" | "Osge" + | "Osma" | "Ougr" + | "Palm" | "Pauc" | "Perm" | "Phag" | "Phli" | "Phlp" + | "Phnx" | "Plrd" | "Prti" + | "Rjng" | "Rohg" | "Runr" + | "Samr" | "Sarb" | "Saur" | "Sgnw" | "Shaw" | "Shrd" + | "Sidd" | "Sind" | "Sinh" | "Sogd" | "Sogo" | "Sora" + | "Soyo" | "Sund" | "Sunu" | "Sylo" | "Syrc" + | "Tagb" | "Takr" | "Tale" | "Talu" | "Taml" | "Tang" + | "Tavt" | "Telu" | "Tfng" | "Tglg" | "Thaa" | "Thai" + | "Tibt" | "Tirh" | "Tnsa" | "Todr" | "Toto" | "Tutg" + | "Ugar" + | "Vaii" | "Vith" + | "Wara" | "Wcho" + | "Xpeo" | "Xsux" + | "Yezi" | "Yiii" + | "Zanb" | "Zinh" | "Zyyy" | "Zzzz" + + code-point-attributes &= + attribute sc { script }? + + code-point-attributes &= + attribute scx { list { script + } }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/simple_case_mapping.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/simple_case_mapping.xml new file mode 100644 index 0000000000..e2acb669c2 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/simple_case_mapping.xml @@ -0,0 +1,11 @@ + + + code-point-attributes &= + attribute suc { "#" | single-code-point }? + + code-point-attributes &= + attribute slc { "#" | single-code-point }? + + code-point-attributes &= + attribute stc { "#" | single-code-point }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/standardized-variants.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/standardized-variants.xml new file mode 100644 index 0000000000..a415a1152a --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/standardized-variants.xml @@ -0,0 +1,10 @@ + + + + ucd.content &= + element standardized-variants { + element standardized-variant { + attribute cps { two-code-points }, + attribute desc { text }, + attribute when { text } }+ }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/start.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/start.xml new file mode 100644 index 0000000000..ba0e2262fb --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/start.xml @@ -0,0 +1,6 @@ + + + + start = + element ucd { ucd.content } + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/index.xml b/unicodetools/src/main/resources/org/unicode/uax42/index.xml new file mode 100644 index 0000000000..6b4733a2b0 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/index.xml @@ -0,0 +1,1353 @@ + + +
    + + Unicode Character Database in XML + + + + + 2024 + + + + Wilcock + John + + + + + + New value for the age attribute: 16.0. + + New values for the blk attribute: Egyptian_Hieroglyphs_Ext_A, + Garay, Gurung_Khema, Kirat_Rai, Myanmar_Ext_C, + Ol_Onal, Sunuwar, Symbols_for_Legacy_Computing_Sup, + Todhri, Tulu_Tigalari. + + New values for the script attribute: Gara, Gukh, + Krai, Onao, Sunu, Todr, Tutg. + + New value for the jg attribute: Kashmiri_Yeh. + New value for the InSC attribute: Reordering_Killer. + + New attributes: MCM, kFanqie, kZhuang. + + Modified patterns for the cjk-radical/@number, kRSUnicode and + kIRG_GSource + attributes. + + Added the do-not-emit element. + + + + Revision 35 being a proposed update, only changes between revisions 34 and 36 are + noted here. + + + + New value for the age attribute: 15.1. + + New value for the blk attribute: CJK_Ext_I. + + New values for the lb attribute: AK, AP, + AS, VF, VI. + + Modified values for the number, radical attributes of the + cjk-radical + element. + + Changed single value into list for the nv code point attribute. + + New code point attributes: ID_Compat_Math_Continue, + ID_Compat_Math_Start, IDSU, NFKC_SCF, InCB. + + Modified patterns for the kBigFive, kIRG_GSource, + kMorohashi, kRSUnicode attributes. + + Changed single values into lists for the kMorohashi, kPrimaryNumeric + Unihan attributes. + + New Unihan attributes: kJapanese, kMojiJoho, + kSMSZD2003Index, kSMSZD2003Readings, kVietnameseNumeric, + kZhuangNumeric. + + + + Revision 33 being a proposed update, only changes between revisions 32 and 34 are + noted here. + + + + New value for the age attribute: 15.0. + + New values for the blk attribute: Arabic_Ext_C, CJK_Ext_H, + Cyrillic_Ext_D, Devanagari_Ext_A, Kaktovik_Numerals, Kawi, + Nag_Mundari. + + New values for the script attribute: Kawi, Nagm. + + New Unihan attribute: kAlternateTotalStrokes. 
+ + Modified patterns for the kIRG_GSource, kIRG_HSource, + kIRG_TSource, kSemanticVariant, kSpecializedSemanticVariant, + kZVariant + attributes. + + + + Revision 31 being a proposed update, only changes between revisions 30 and 32 are + noted here. + + + + New value for the age attribute: 14.0. + + New values for the blk attribute: Arabic_Ext_B, + Cypro_Minoan, Ethiopic_Ext_B, Kana_Ext_B, + Latin_Ext_F, Latin_Ext_G, Old_Uyghur, Tangsa, + Toto, UCAS_Ext_A, Vithkuqi, Znamenny_Music. + + New values for the script attribute: Cpmn, Ougr, + Tnsa, Toto, Vith. + + New values for the jg attribute: Thin_Yeh, Vertical_Tail. + + New Unihan attribute: kStrange. + + Modified patterns for the kIRG_GSource, kIRG_MSource, + kIRG_VSource, kPhonetic, kSpoofingVariant attributes. + + Removal of the kWubi attribute, which has never been present in + released versions of the UCD. + + + + Revision 29 being a proposed update, only changes between revisions 28 and 30 are + noted here. + + + + New value for the age attribute: 13.0. + + New values for the blk attribute: Chorasmian, CJK_Ext_G, + Dives_Akuru, Khitan_Small_Script, Lisu_Sup, + Symbols_For_Legacy_Computing, Tangut_Sup, Yezidi. + + New values for the script attribute: Chrs, Diak, + Kits, Yezi. + + New value for the InPC attribute: Top_And_Bottom_And_Left. + + New Unihan attributes kSpoofingVariant, kUnihanCore2020, + kIRG_SSource, kIRG_UKSource, kTGHZ2013. + + New Emoji attributes Emoji, EPres, EMod, + EBase, EComp, ExtPict. + + Modified patterns for the kIRG_GSource, kIRG_HSource, + kIRG_KPSource, kIRG_KSource, kIRG_TSource, kKangXi, + kSemanticVariant, kSimplifiedVariant, + kSpecializedSemanticVariant, kTraditionalVariant attributes. + + + + Revision 27 being a proposed update, only changes between revisions 26 and 28 are + noted here. + + + + New value for the age attribute: 12.1. + + + + + + New value for the age attribute: 12.0. + + New values for the script attribute: Elym, Hmnp, + Nand, Wcho. 
+ + New values for the blk attribute: + Egyptian_Hieroglyph_Format_Controls, Elymaic, Nandinagari, + Nyiakeng_Puachue_Hmong, Ottoman_Siyaq_Numbers, Small_Kana_Ext, + Symbols_And_Pictographs_Ext_A, Tamil_Sup, Wancho. + + Modified patterns for the kIRG_GSource, kIRG_KSource, + kIRG_TSource, kTaiwanTelegraph attributes. + + + + Revision 24 being a proposed update, only changes between revisions 23 and 25 are + noted here. + + + + New value for the age attribute: 11.0. + + New values for the blk attribute: Chess_Symbols, + Dogra, Georgian_Ext, Gunjala_Gondi, + Hanifi_Rohingya, Indic_Siyaq_Numbers, Makasar, + Mayan_Numerals, Medefaidrin, Old_Sogdian, Sogdian. + + New values for the script attribute: Dogr, Gong, + Maka, Medf, Rohg, Sogd, Sogo. + + New values for the jg attribute: Hanifi_Rohingya_Kinna_Ya, + Hanifi_Rohingya_Pa. + + New value for the wb attribute: WSegSpace. + + New values for the InSC attribute: Consonant_Initial_Postfixed. + + New attributes: EqUIdeo, kJinmeiyoKanji, kJoyoKanji, + kKoreanEducationHanja, kKoreanName, kTGH. + + Modified patterns for the kTGT_MergedSrc attribute. + + Modified patterns for the kIRG_GSource, kIRG_HSource and + kIRG_VSource + attributes. + + + + Revision 22 being a proposed update, only changes between revisions 21 and 23 are + noted here. + + + + New value for the age attribute: 10.0. + + New values for the blk attribute: CJK_Ext_F, Kana_Ext_A, + Masaram_Gondi, Nushu, Soyombo, Syriac_Sup, + Zanabazar_Square. + + New values for the sc attribute: Gonm, Nshu, + Soyo, Zanb. + + New values for the jg attribute: Malayalam_Nga, + Malayalam_Ja, Malayalam_Nya, Malayalam_Tta, Malayalam_Nna, + Malayalam_Nnna, Malayalam_Bha, Malayalam_Ra, + Malayalam_Lla, Malayalam_Llla, Malayalam_Ssa. + + New value for the InPC attribute: Bottom_And_Left. + + Modified patterns for the kIRG_GSource, kIRG_JSource, + kIRG_KSource + attributes. 
+ + New code point attributes: vo, + RI + + New code point attributes for Nushu data: kSrc_NushuDuben and + kReading. + + + + Revision 20 being a proposed update, only changes between revisions 19 and 21 are + noted here. + + + + New value for the age attribute: 9.0. + + New values for the sc attribute: Adlm, Bhks, + Marc, Newa, Osge, Tang. + + New values for the blk attribute: Adlam, Bhaiksuki, + Cyrillic_Ext_C, Glagolitic_Sup, Ideographic_Symbols, + Marchen, Mongolian_Sup, Newa, Osage, + Tangut, Tangut_Components. + + New values for the gcb attribute: EB, EBG, EM, + GAZ, ZWJ. + + New values for the wb attribute: EB, EBG, EM, + GAZ, ZWJ. + + New values for the lb attribute: EB, EM, ZWJ. + + New values for the jg attribute: African_Feh, + African_Noon, African_Qaf. + + New code point attributes: PCM, kRSTUnicode and + kTGT_MergedSrc. + + Modified patterns for the kRSUnicode, kRSKangXi, + kMandarin, kIRG_JSource, kIRG_USource and kFennIndex + attributes. + + + + Revision 18 being a proposed update, only changes between revisions 17 and 19 are + noted here. + + + + New value for the age attribute: 8.0. + + New values for the sc attribute: Ahom, Hatr, + Hluw, Hung, Mult, Sgnw. + + New values for the blk attribute: Ahom, + Anatolian_Hieroglyphs, Cherokee_Sup, CJK_Ext_E, + Early_Dynastic_Cuneiform, Hatran, Multani, Old_Hungarian, + Sup_Symbols_And_Pictographs, Sutton_SignWriting. + + New values for the InSC attribute: Consonant_Killer, + Consonant_Prefixed, Consonant_With_Stacker, Syllable_Modifier. + + New code point attributes: InPC, kJa. + + New patterns for the kIRG_GSource attribute: GFC-, GGFZ-. + + Switched the reference to ISO 19757 from :2003 and :2003 Amd1 to :2008. + + + Revision 16 being a proposed update, only changes between revisions 15 and 17 are + noted here. + + + + New value for the age attribute: 7.0. + + New values for the jg attribute. + + New values for the sc attribute. + + New values for the blk attribute. + + New values for the InSC attribute. 
+ + New values for the kIICore attribute. + + New values for the kIRG_GSource attribute. + + + + Revision 14 being a proposed update, only changes between revisions 13 and 15 are + noted here. + + + + New value for the age attribute: 6.3. + + New values DQ, HL, SQ for the WB attribute (for Unicode 6.3). + + New code point attributes bpt and bpb (for Unicode 6.3). + + New values for the bc attribute: LRI, RLI, FSI, + PDI + (for Unicode 6.3). + + Updated the patterns for kHanyuPinlu and kTotalStrokes (for + Unicode 6.3). + + Updated the patterns for kIRG_HSource and kIRG_HSource (for + Unicode 6.2). + + Clarified that the child elements of list-like elements are in no particular order. + + + Revision 12 being a proposed update, only changes between revisions 11 and 13 are + noted here. + + + + New value for the age attribute: 6.2. + + New value for the gcb, wb and lb attributes: + RI + (for Unicode 6.2). + + Updated the patterns for kIRG_GSource and kIRG_HSource (for + Unicode 6.2). + + + + Revision 10 being a proposed update, only changes between revisions 9 and 11 are + noted here. + + + + Clarified the default values. + Indicate that property values may change from one release to the next. + Introduced the blk attribute, for the Block property. + + Introduced the scx attribute, for the ScriptExtensions property. + + Introduced the name-alias element, for the Name_Alias property. + + New value for the age attribute: 6.1. + + New values for the script attribute: Cakm, Merc, + Mero, Plrd, Shrd, Sora, Takr. + + New values for the lb attribute: HL and CJ. + + New value for the jg attribute: Rohingya_Yeh. + + The value of the fc_nfkc attribute must now be either # or + one-or-more-code-points. + + For the nv attribute, the absence of a numeric value is now represented by + NaN + rather than by the empty string. + + The values of the ccc attribute are now restricted to 0..254, instead of 0..255.
+ + Updated the patterns for kSemanticVariant, + kSpecializedSemanticVariant, kIRG_USource, and kMandarin. + + + + Revision 8 being a proposed update, only changes between revisions 7 and 9 are noted + here. + + + + New value for the age attribute: 6.0. + + New value for the jg attribute: + Teh_Marbuta_Goal + + New values for the script attribute: Batk, Brah, + Mand. + + Updated the patterns for kIRG_GSource, kIRG_HSource, + kIRG_JSource, kIRG_KSource, kIRG_MSource, + kIRG_TSource, kIRG_VSource. + + Added the InSC and InMC elements. + + Added the emoji-sources element. + + + + Revision 6 being a proposed update, only changes between revisions 5 and 7 are noted + here. + + + + Changed the type of block/@first-cp, block/@last-cp and + normalization-corrections/@cp + from text to + single-code-point + + Changed the type of named-sequence/@cps, + provisional-named-sequences/@cps, normalization-correction/@old and + normalization-correction/@new + from text to one-or-more-code-points. + + Changed the type of standardized-variants/@cps from text to + two-code-points. + + New values for the jg attribute: Farsi_Yeh and Nya. + + New value for the age attribute: 5.2. + + New values for the sc attribute: Lana, Tavt, + Avst, Egyp, Samr, Lisu, Bamu, Java, + Mtei, Armi, Sarb, Prti, Phli, Orkh, + Kthi. + + New value for the lb attribute: CP. + + New value for the sc attribute: Zinh. + + New code point attributes CI, Cased, CWCF, + CWCM, CWL, CWKCF, CWT, CWU, + NFKC_CF. + + New attributes kHanyuPinyin and kIRG_MSource. + + New element + cjk-radicals + + Updated the patterns for kIRG_GSource, kIRG_JSource, + kIRG_KPSource, kIRG_KSource, kIRG_TSource, + kIRG_VSource, kHanyuPinlu, kMandarin, + kSemanticVariant, kSpecializedSemanticVariant, + kVietnamese, kZVariant. + + Point out that Relax NG schemas do not modify or augment the infoset, and that it is possible + to convert mechanically our schema to other schema languages.
+ + + + Revision 4 being a proposed update, only changes between revisions 3 and 5 are noted + here. + + + + First approved version, for Unicode 5.1.0. + For optional elements which act as collections, such as repertoire and + named-sequences, impose that there be at least one element in the collection. + + Remove the constraint that the value jg is limited when jt has + certain values; similarly for bmg / Bidi_M and for nv / + nt. + + Value NL added to the WB attribute (for Unicode 5.1). + + Value PP added to the GCB attribute (for Unicode 5.1). + + Corrected the Vai script value to Vaii. + + Removed the discussion of elements or attributes in different namespaces. + Removed the code-point element. + + + + + + Promoted to Draft UAX. + Changed the title from "An XML representation of the UCD" + Value 5.1 added to the age attribute (for Unicode 5.1). + + Value SM added to the gcb attribute (for Unicode 5.1). + + Values CR, Extend, LF, MB added to the + WB + attribute (for Unicode 5.1). + + Values CR, EX, LF, SC added to the SB + attribute (for Unicode 5.1). + + Value Burushaski_Yeh_Barree added to the jg attribute (for + Unicode 5.1). + + Value Alef_Maqsurah added to the jg attribute (for Unicode 2.x). + + Values Cari, Cham, Kali, Lepc, + Lyci, Lydi, Olck, Rjng, Saur, Sund and + Vai + added to the sc attribute (for Unicode 5.0). + + + jamo + attribute renamed to + JSN + + + sfc + attribute renamed to + scf + + Attribute kXHC1983 added (for Unicode 5.1.0). + + Pattern for attribute kIRG_USource extended (for Unicode 5.1.0). + + Element provisional-named-sequences added (for Unicode 5.0) + + + + + + First working draft. + + + + + + + This annex describes an XML representation of the Unicode Character Database. + + + + +
    + Introduction + In working on Unicode implementations, it is often useful to access the full content of the Unicode + Character Database (UCD). For example, in establishing mappings from characters to glyphs in fonts, it is + convenient to see the character scalar value, the character name, the character East Asian width, along with + the shape and metrics of the proposed glyph to map to; looking at all this data simultaneously helps in + evaluating the mapping. + + Directly accessing the data files that constitute the UCD is sometimes a daunting proposition. The data is + dispersed in a number of files of various formats, and there are just enough peculiarities (all justified by + the processing power available at the time the UCD representation was designed) to require a fairly intimate + knowledge of the data format itself, in addition to the meaning of the data. + + Many programming environments (for example, Java or ICU) do give access to the UCD. However, those + environments tend to lag behind releases of the standard, or support only some of the UCD content. + + Unibook is a wonderful tool to explore the UCD and in many cases is just the ticket; however, it is + difficult to use when the task at hand has not been built-in, or when non-UCD data is to be displayed as + well. + + This annex presents an alternative representation of the UCD, which is meant to overcome these + difficulties. We have chosen an XML representation, because parsing becomes a non-issue: there are a number + of XML parsers freely available, and using them is often fairly easy. In addition, there are freely + available tools that can perform powerful operations on XML data; for example, XPATH and XQUERY engines can + be thought of as a “grep” for XML data and XSLT engines can be thought of as + “awk” for XML data. + + It is important to note that we are interested in exploring the content of the UCD, rather than in using + the UCD data to process character streams. 
Thus, we are not concerned so much by the speed of processing or + the size of our representation. + + Our representation supports the creation of documents that represent only parts of the UCD, either by not + representing all the characters, or by not representing all the properties. This can be useful when only + some of the data is needed. + + This annex presents only the XML representation format of the UCD. The data itself is part of the Unicode + Character Database. + +
    + + + +
    + Overall schema + +
    + General principles + Our schema can be used to create and validate documents which are intended to represent properties of + Unicode code points, blocks, named sequences, normalization corrections, standardized variants, CJK + radicals and emoji sources. A document may represent the values actually assigned in a given version of + the UCD, or it may represent a draft version of the UCD, or a private agreement on Private Use + characters. The validity of an XML document with respect to the schema defined in this annex does not + assert anything about the correctness of the values. + + Valid documents may provide values for only some of the code points, or some of the Unicode + properties. Furthermore, they may also incorporate non-Unicode properties. + + Our schema is defined using English. However, a useful subset of the validity constraints can be + captured using a schema language, thereby simplifying the task of validating documents. We have chosen + Relax NG [ISO 19757], + in the compact syntax, as the schema language. It is important to stress that the schema which is + defined in English imposes more constraints on the documents than can be validated with the Relax NG + schema. + + An important characteristic of Relax NG is that its schemas do not modify or augment the infoset of + the documents. Therefore, it is possible to process our XML representation without using the schema. + Also, the schema is relatively straightforward and can be converted mechanically to other schema + languages. + + While our XML representation is not intended to be used during processing of characters and strings, + it is still a design principle for our schema to support the relatively efficient representation of the + UCD. This is achieved by an inheritance mechanism, similar to property inheritance in CSS or in XSL:FO + (see section 4.3 Group). + + Many invariants impose constraints on the values of the different properties for a given code point.
+ For example, if the value of the Numeric Type property is None, then the value of the + Numeric Value property should be NaN; and if the value of the Other + Alphabetic property is true, then the value of the Alphabetic property should be + true. Those invariants are not captured in the schema. + +
    + + +
    + Namespace + The namespace for our elements is “http://www.unicode.org/ns/2003/ucd/1.0”. Our + attributes are in the empty namespace. + + + In all our examples, we assume that this namespace is the default one. + +
    + + +
    + Datatypes + We use the standard XML Schema datatypes: + + Characters are pervasive in the UCD, and will need to be represented. Representing characters directly + by themselves would seem the most obvious choice; for example, we could express that the decomposition + of U+00E8 is “&#x0065;&#x0300;”, that is, have exactly two characters in (the + infoset of) the XML document. However, the current XML specification limits the set of characters + that can be part of a document. Another problem is that the various tools (XML parser, XPATH engine, + etc.) may equate U+00E8 with U+0065 U+0300, thus making it difficult to figure out which of the two + sequences is contained in the database (which is sometimes important for our purposes). Therefore, we + chose instead to represent characters by their code points; we follow the usual convention of four to + six hexadecimal digits (uppercase) and code points in a sequence separated by space; for example, the + decomposition of U+00E8 will be represented by the nine characters “0065 0300” in the + infoset. + + +
    + + +
    + Root Element + The root element of valid documents is a ucd. + + +
    + + +
    + Common attributes + A large number of properties are boolean. We uniformly use the values Y and + N for those: + + +
    + + +
    + Ordering of elements + In elements that hold lists of child elements, such as repertoire, + group, or standardized-variants, the schema does not require that the + child elements be in any particular order. + +
    +
    + + +
    + Description + The root element may have a description child element, which in turn contains any string, + which is meant to describe what the XML document purports to describe. + + It is recommended that if the document purports to represent the UCD of some Unicode version, the + description be selected in accord with the rules listed in [Versions]; and + conversely, that documents which do not purport to represent the UCD be described as such. + + +
    + + +
    + Repertoire + The repertoire child element of the ucd element describes the code points and + their properties. As we will see shortly, code points can be described individually or as part of a group: + + + + +
    + Sets of code points + It is often the case that successive code points have the same property values, for a given set of + properties. The most striking example is that of an unallocated plane, where all but the last two + code points are reserved and have the same property values. Another example is the URO (U+4E00 + .. U+9FA5) where all the code points have the same property values if we ignore their name and their + Unihan properties. + + + This observation suggests that it is profitable to represent sets of code points which share the + same properties, rather than individual code points. To make the representation of the sets simple, + we restrict them to be segments in the code point space, that is a set is defined by the first and + last code point it contains. Those are captured by the attributes first-cp and + last-cp. The attribute cp is a shorthand notation for the case where the set + has a single code point. + + In the repertoire, there must be at most one code-point + element for a given code point. + +
    + + +
    + Code point types + When thinking about Unicode code points, it is useful to split them into four types: + + + those assigned to abstract characters (PUA or not) + the noncharacters + the surrogate code points + the reserved code points + + This leads to four elements to describe sets of code points: + + +
    + + +
    + Group + While we already recognized the situation where a set of code points have exactly the same set of + property values, another common situation is that of code points which have almost all the same + property values. + + For example, the characters U+1740 BUHID LETTER A .. U+1753 BUHID VOWEL SIGN U all have the age + “3.2”, and all have the script “Buhd”. On the one hand, it is convenient + to support data files in which those properties are explicitly listed with every code point, as this + makes answering questions like “what is the age of U+1749?” easier, because that data + is expressed right there. On the other hand, this leads to rather large data files, and it also tends + to obscure the differences between similar characters. + + + Our representation accounts for this situation with the notion of groups. A + group element is simply a container of code points that also holds default values for + the properties. If a code point inside a group does not explicitly list a property but the + group lists it, then the code point inherits that property from its + group. For example, the fragment with explicit properties: + + + <char cp="1740" age="3.2" na="BUHID LETTER A" gc="Lo" sc="Buhd"/> + <char cp="1741" age="3.2" na="BUHID LETTER I" gc="Lo" sc="Buhd"/> + <char cp="1752" age="3.2" na="BUHID VOWEL SIGN I" gc="Mn" sc="Buhd"/> + <char cp="1820" age="3.0" na="MONGOLIAN LETTER A" gc="Lo" sc="Mong"/> + is equivalent to this fragment which uses a group: + + + <group age="3.2" gc="Lo" sc="Buhd"> + <char cp="1740" na="BUHID LETTER A"/> + <char cp="1741" na="BUHID LETTER I"/> + <char cp="1752" na="BUHID VOWEL SIGN I" gc="Mn"/> + <char cp="1820" age="3.0" na="MONGOLIAN LETTER A" sc="Mong"/> + </group> + The element for U+1740 does not have the age attribute, and it therefore inherits it + from its enclosing group element, that is “3.2”. On the other hand, + the element for U+1820 does have this attribute, so the value is “3.0”.
+ + As this example illustrates, the notion of group does not necessarily align with the + notion of Unicode block. It is entirely defined and limited to our representation. In particular, the + value of a property for a code point can always be determined from the XML document alone, assuming + that this property and this code point are expressed at all. Of course, one may create an XML + representation where the groups happen to coincide with the Unicode blocks. + + Groups cannot be nested. The motivation for this limitation is to make the life of consumers + easier: either a property is defined by the element for a code point, or it is defined by the + immediately enclosing group element. + + +
    + + +
    + Properties + Each property, except for the Special_Case_Condition and Name_Alias + properties, is represented by an attribute. In an XML data file, the absence of an attribute (possibly + only on some code-points) means that the document does not express the value + of the corresponding property. Conversely, the presence of an attribute is an expression of the + corresponding property value; the implied null value is represented by the empty string. + + The Name_Alias property is represented by zero or more name-alias child + elements. Unlike the situation for properties represented by attributes, it is not possible to determine + whether all the aliases have been represented in a data file by inspecting that data file. + + The name of an attribute is the abbreviated name of the property as given in the file + PropertyAliases.txt in the corresponding version of the UCD. For the Unihan + properties, the name is that given in the various versions of the Unihan database. + + For catalog and enumerated properties, the values are those listed in the file + PropertyValueAliases.txt in the corresponding version of the UCD; if there is an abbreviated + name, it is used, otherwise the long name is used. + + Note that the set of possible values for a property captured in this schema may change from one + version to the next. + + + +
    + Age property + The age attribute captures the version of Unicode in which a code point was + assigned to an abstract character, or made a surrogate or non-character. + + +
    + + +
    + Name properties + There are two name properties: the name given by the current version of the standard + (na), and possibly the name this character had in version 1.0 of the standard + (na1). + + + + The majority of the characters in Unicode have a name which is of the form CJK UNIFIED + IDEOGRAPH-<code point>. It also happens that character names cannot + contain the character U+0023 # NUMBER SIGN, so we adopted the following convention: if a + code point has the attribute na (either directly or by inheritance from an enclosing + group), then occurrences of the character # in the name are to be interpreted as the value of the + code point. For example: + + + <char cp="3400" na="CJK UNIFIED IDEOGRAPH-3400"/> + and + + <char cp="3400" na="CJK UNIFIED IDEOGRAPH-#"/> + are equivalent. The # can be in any position in the value of the na + attribute. The convention also applies just as well to a set of multiple code points: + + + <char cp="3400" na="CJK UNIFIED IDEOGRAPH-3400"/> + <char cp="3401" na="CJK UNIFIED IDEOGRAPH-3401"/> + is equivalent to + + <char cp="3400" na="CJK UNIFIED IDEOGRAPH-#"/> + <char cp="3401" na="CJK UNIFIED IDEOGRAPH-#"/> + which in turn is equivalent to: + + <char first-cp="3400" last-cp="3401" na="CJK UNIFIED IDEOGRAPH-#"/> +
    + + +
    + Name Alias properties + The Name_Alias property is represented by zero or more name-alias + child elements: + + +
    + + +
    + Block property + The Block property is represented by the blk attribute: + + +
    + + +
    + General Category + The general category is represented by the gc attribute. + + +
    + + +
    + Combining properties + The combining class is represented by the ccc attribute, which holds the decimal + representation of the combining class. + + Because the set of values that this property has taken across the various versions of the UCD + is rather large, our schema does not restrict the possible values to those actually used. + + +
    + + +
    + Bidirectionality properties + The bidirectional class is represented by the bc attribute. + + + The mirrored property is represented by the Bidi_M attribute, which takes a + boolean value. + + + The bmg attribute is the code point of a character whose glyph is typically + a mirrored image of the glyph for the current character. + + + Note that we do not express the “Best Fit” element recorded in BidiMirroring.txt. + For one thing, it is not meant to be machine readable. More importantly, the idea underlying the + mirrored glyph is delicate to use, since it makes assumptions about the design of the fonts, and + the best fit goes even farther. + + The Bidi_Control property is represented by the Bidi_C attribute. + + + The bidi paired bracket type and bidi paired bracket properties are represented by the + bpt and bpb attributes respectively. + + + +
    + + +
    + Decomposition properties + The decomposition type and decomposition mapping properties are represented by the dt + and dm attributes. + + Most characters have a decomposition mapping to themselves. This is very similar to the + situation we encountered with names, and we adopted a similar convention: if the value of a + decomposition mapping is the character itself, we use the attribute value # (U+0023 # + NUMBER SIGN) as a shorthand notation; this enables those attributes to be captured in groups. + + + The properties Composition_Exclusion and Full_Composition_Exclusion are + represented by the attributes CE and Comp_Ex: + + + The properties NFC_Quick_Check, NFD_Quick_Check, + NFKC_Quick_Check, NFKD_Quick_Check, Expands_On_NFC, + Expands_On_NFD, Expands_On_NFKC, Expands_On_NFKD, and + FC_NFKC_Closure have corresponding attributes. + + +
    + + +
    + Numeric Properties + The numeric type is represented by the nt attribute. + + The numeric value is represented by the nv attribute, represented as a whole + number or a fraction. + + +
    + + +
    + Joining properties + The joining class of a character is represented by the jt attribute. + + The jg attribute is the joining group of the character. + + + The Join_Control property is represented by the Join_C attribute. + + +
    + + +
    + Linebreak properties + The Line_Break property is represented by the lb attribute. + + +
    + + +
    + East Asian Width property + The East Asian width property is represented by the ea attribute. + + +
    + + +
    + Case properties + The Uppercase, Lowercase, Other_Uppercase and + Other_Lowercase properties are represented by corresponding attributes. + + + Most characters have case mapping and case folding properties that simply map or fold to + themselves. This is very similar to the situation we encountered with names, and we adopted a + similar convention: if the value of a case mapping or case folding property is the character + itself, we use the attribute value # (U+0023 # NUMBER SIGN) as a shorthand notation; this + enables those attributes to be captured in groups. + + The simple case mappings are recorded in the suc, slc and stc + attributes. + + + The non-simple case mappings are recorded in the uc, lc and tc + attributes. + + + The Simple_Case_Folding and Case_Folding properties are recorded in the + scf and cf attributes respectively. + + + The Case_Ignorable, Cased, Changes_When_Casefolded, + Changes_When_Casemapped, Changes_When_Lowercased, + Changes_When_NFKC_Casefolded, Changes_When_Titlecased, + Changes_When_Uppercased, NFKC_Casefold, and + NFKC_Simple_Casefold properties are recorded in these attributes: + + + Note that the UCD records more information about case folding than is expressed in the + properties, specifically the entries in CaseFolding.txt with status T. + +
    + + +
    + Script properties + The script and script extension properties are represented by the sc and + scx attributes respectively. + + +
    + + +
    + ISO Comment properties + The ISO 10646 comment field is represented by the isc attribute. + + +
    + + +
    + Hangul properties + The property Hangul_Syllable_Type is represented by the hst attribute. + + + The property Jamo_Short_Name is represented by the JSN attribute: + + +
    + + +
    + Indic properties + The property Indic_Syllabic_Category is represented by the InSC + attribute. + + + The property Indic_Positional_Category is represented by the InPC + attribute: + + + The property Indic_Conjunct_Break is represented by the InCB attribute: + + +
    + + +
    + Identifier and Pattern and programming language properties + + The properties ID_Start, Other_ID_Start, XID_Start, + ID_Continue, Other_ID_Continue, XID_Continue, + ID_Compat_Math_Start, and ID_Compat_Math_Continue are represented by + corresponding attributes: + + + The properties Pattern_Syntax and Pattern_White_Space are represented + by corresponding attributes: + + +
    + + +
    + Properties related to function and graphic characteristics + The properties Dash, Hyphen, Quotation_Mark, + Terminal_Punctuation, Sentence_Terminal, Diacritic, + Extender, Soft_Dotted, Alphabetic, + Other_Alphabetic, Math, Other_Math, Hex_Digit, + ASCII_Hex_Digit, Default_Ignorable_Code_Point, + Other_Default_Ignorable_Code_Point, Logical_Order_Exception, + Prepended_Concatenation_Mark, Modifier_Combining_Mark, + White_Space, Vertical_Orientation, and Regional_Indicator + describe the function or graphic characteristic of a character, and each have a corresponding + attribute. + + +
    + + +
    + Properties related to boundaries + The properties Grapheme_Base, Grapheme_Extend, + Other_Grapheme_Extend, Grapheme_Link, + Grapheme_Cluster_Break, Word_Break, and Sentence_Break each + have a corresponding attribute: + + +
    + + +
    + Properties related to ideographs + The properties Ideographic, Unified_Ideograph, + Equivalent_Unified_Ideograph, IDS_Binary_Operator, + IDS_Trinary_Operator, IDS_Unary_Operator, and Radical have + corresponding attributes: + + +
    + + +
    + Miscellaneous properties + The properties Deprecated, Variation_Selector, and + Noncharacter_Code_Point have corresponding attributes: + + +
    + + +
    + Unihan properties + The Unihan properties (from the Unihan database) are represented as attributes. + + +
    + + +
    + Tangut data + The Tangut data are represented as attributes. The attribute kRSTUnicode + represents the radical stroke index. The attribute kTGT_MergedSrc indicates the + source reference for the character. + + +
    + + +
    + Nushu data + The Nushu data are represented as attributes. The attribute kSrc_NushuDuben + indicates the page number and order of the item from the NushuDuben reference source. Nushu common + reading is represented as kReading. + +
    + + +
    + Emoji properties + The properties Emoji, EPres, EMod, EBase, + EComp, and ExtPict have corresponding attributes: + + +
    +
    +
    + + +
    + Blocks + The blocks child of the ucd describes the blocks. It has one child + block element per block, with attributes to describe the extent and name of the block. + + +
    + + +
    + Named Sequences + The named-sequences child of the ucd describes the named sequences. It has one + child named-sequence element per named sequence, with attributes to describe the name and + sequence. + + Similarly, the provisional-named-sequences child of the ucd describes the + provisional named sequences. + + +
    + + +
    + Normalization Corrections + The normalization-corrections child of the ucd describes the normalization + corrections. It has one child normalization-correction element per correction, with + attributes to describe the code point affected, its old normalization, its new normalization and the + version of Unicode in which the correction was made. + + +
    + + +
    + Standardized Variants + The standardized-variants child of the ucd describes the standardized + variants. It has one child element standardized-variant per variant. The attributes on that + last element capture the variation sequence, the description of the desired appearance, and the shaping + environment under which the appearance is different. + + +
    + + +
    + CJK Radicals + The cjk-radicals child of the ucd describes the CJK radicals. It has one + child element cjk-radical per radical. The attributes on that last element capture the + radical number, the corresponding CJK radical character, and the corresponding CJK unified ideograph. + + +
    + + +
    + Emoji sources + The emoji-sources child of the ucd describes the emoji sources. + + + + +
    + + +
    + Do Not Emit + The do-not-emit child of the ucd describes the + character sequences that should not be emitted or generated in newly authored texts. + + + +
    + + +
    + The full schema + Our schema is just the accumulation of the pieces we have described so far: + + + + + + + + + + + + + + + + + + + + + An expanded version is linked from the top of this document. +
    + + +
    + Examples + Here is a fragment of the UCD for a few representative + characters (only some of the properties are represented): + + + + + + + + + + + + + + + + + + + + + + + + + + + +]]> + +
    + + + + Acknowledgments + Thanks to Markus Scherer and Mark Davis for their help developing this XML representation. Thanks to + the reviewers: Julie Allen, Ernest van den Boogaard, Daniel Bünzli, John Cowan, Asmus Freytag, + Felix Sasaki, Andrew West. Special thanks to Eric Muller and Laurențiu Iancu. + + +
    diff --git a/unicodetools/src/main/resources/org/unicode/uax42/index2html.xsl b/unicodetools/src/main/resources/org/unicode/uax42/index2html.xsl new file mode 100644 index 0000000000..f0a95fa958 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/index2html.xsl @@ -0,0 +1,611 @@ + + + + + + + + + + + + + + + + + + + + + <xsl:choose> + <xsl:when test="articleinfo/unicode:tr/@class='uax'"> + <xsl:text>UAX</xsl:text> + </xsl:when> + <xsl:when test="articleinfo/unicode:tr/@class='uts'"> + <xsl:text>UTS</xsl:text> + </xsl:when> + <xsl:when test="articleinfo/unicode:tr/@class='utr'"> + <xsl:text>UTR</xsl:text> + </xsl:when> + </xsl:choose> + <xsl:text> #</xsl:text> + <xsl:value-of select="articleinfo/unicode:tr/@number"/> + <xsl:text>: </xsl:text> + <xsl:value-of select="title"/> + + + + + + + + + + + + +
    + + [Unicode] +  Technical Reports +
     
    +
    +

    + + + + + Unicode® Standard Annex + + + Unicode® Technical Standard + + + Unicode® Technical Report + + + # + +

    +

    + + +
    + + +
    + + + + + + +
    + +

    Modifications

    +

    This section indicates the changes introduced by each revision.

    + +
    + +
    + + + + + Working draft + + + Proposed Update + + + + + + + + + + + + + + + + https://www.unicode.org/reports/tr + + /tr + + - + + .html + + + + + + + + https://www.unicode.org/reports/tr + + /tr + + - + + .html + + + + https://www.unicode.org/reports/tr + + / + + + + https://www.unicode.org/reports/tr + + /tr + + - + + .rnc + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Version + Unicode + + + + +
    + + + Editor + + + Editors + + + + +
    Date + + + + +
    This Version + + + + +
    Previous Version + + + n/a + + + + + + + + +
    Latest Version + +
    Latest Proposed Update + proposed.html +
    Schema + + + + +
    Revision + + + + + + +
    +
    + + + + + + + + + +
    +
    + + + ( + mailto: + ) + + + + + +

    Summary

    + +
    + + +

    +
    + + + + +

    Status

    + + +

    This document has been reviewed by Unicode members and other interested parties, and has been + approved for publication by the Unicode Consortium. This is a stable document and may be used as reference + material or cited as a normative reference by other specifications.

    +
    + +

    + + This is a draft document which may be updated, replaced, or + superseded by other documents at any time. Publication does not imply endorsement by the Unicode + Consortium. This is not a stable document; it is inappropriate to cite this document as other than a + work in progress.

    +
    +
    + + +
    +

    A Unicode Standard Annex (UAX) forms an integral part of the Unicode Standard, but is + published online as a separate document. The Unicode Standard may require conformance to normative + content in a Unicode Standard Annex, if so specified in the Conformance chapter of that version of the + Unicode Standard. The version number of a UAX document corresponds to the version of the Unicode Standard + of which it forms a part.

    +
    +

    Please submit corrigenda and other comments with the online reporting form [Feedback]. Related information that is useful in + understanding this annex is found in Unicode Standard Annex #41, “Common References for Unicode Standard + Annexes.” For the latest version of the Unicode Standard, see [Unicode]. For a list of current Unicode + Technical Reports, see [Reports]. For more information about + versions of the Unicode Standard, see [Versions]. For any + errata which may apply to this annex, see [Errata].

    +
    + +
    +

    A Unicode Technical Standard (UTS) is an independent specification. Conformance to the Unicode + Standard does not imply conformance to any UTS.

    +
    +

    Please submit corrigenda and other comments with the online reporting form [ + Feedback]. Related information that is useful in understanding this document is found in References. For the latest version of the Unicode Standard see [Unicode]. For a list of current Unicode Technical Reports see [Reports]. For more information about versions of the Unicode Standard, see + [Versions].

    +
    + +
    +

    A Unicode Technical Report (UTR) contains informative material. Conformance to the Unicode + Standard does not imply conformance to any UTR. Other specifications, however, are free to make normative + references to a UTR.

    +
    +

    Please submit corrigenda and other comments with the online reporting form [ + Feedback]. Related information that is useful in understanding this document is found in References. For the latest version of the Unicode Standard see [Unicode]. For a list of current Unicode Technical Reports see [Reports]. For more information about versions of the Unicode Standard, see + [Versions].

    +
    +
    +
    + + + + +

    Contents

    + +
    + + +
  • + + +
      + +
    +
    +
  • +
    + + + + + + +      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    + + + +

    +
    + + +

    +
    + + +
    +
    + + + + + + + + + _blank + + + + + + + + + + + + + + + + + + + [ + + + + + + + + + + + + : + + + , ] + + + + +

    + [, + ] + + = + + + +

    +
    + + +

    + [] + + = + +

    +
    + + + + + + + + +
    +

    + Revision +

    + +
    +
    + + +
    +

    + +

    +
    +
    + + +
      + +
    +
    + + +
  • + +
  • +
    + + + + + + + + + + + + + + + + background-color: #ffff00; border-style:dotted; border-width:1px + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    diff --git a/unicodetools/src/main/resources/org/unicode/uax42/index2rnc.xsl b/unicodetools/src/main/resources/org/unicode/uax42/index2rnc.xsl new file mode 100644 index 0000000000..b7a8dfa819 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/index2rnc.xsl @@ -0,0 +1,45 @@ + + + + + + + + + + + + # Copyright © Unicode, Inc. + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/unicodetools/src/main/resources/org/unicode/uax42/output/index.html b/unicodetools/src/main/resources/org/unicode/uax42/output/index.html new file mode 100644 index 0000000000..13bf8181d1 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/output/index.html @@ -0,0 +1,3482 @@ + + + + + + + UAX #42: Unicode Character Database in XML + + + + + + + + + + + +
    + + [Unicode] +  Technical Reports +
     
    +
    +

    + Proposed Update Unicode® Standard Annex #42

    +

    Unicode Character Database in XML

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    VersionUnicode 16.0.0 +
    + Editor + + John Wilcock
    +
    Date + 2024-08-15 +
    This Version + + https://www.unicode.org/reports/tr42/tr42-36.html + +
    Previous Version + + https://www.unicode.org/reports/tr42/tr42-34.html + +
    Latest Version + https://www.unicode.org/reports/tr42/ +
    Latest Proposed Update + https://www.unicode.org/reports/tr42/proposed.html +
    Schema + + https://www.unicode.org/reports/tr42/tr42-36.rnc + +
    Revision + + 36 + +
    +

    Summary

    +

    + This annex describes an XML representation of the Unicode Character Database. +

    +

    + Status +

    +

    + This is a + draft + document which may be updated, replaced, or + superseded by other documents at any time. Publication does not imply endorsement by the Unicode + Consortium. This is not a stable document; it is inappropriate to cite this document as other than a + work in progress. +

    +
    +

    + + A Unicode Standard Annex (UAX) forms an integral part of the Unicode Standard, but is + published online as a separate document. The Unicode Standard may require conformance to normative + content in a Unicode Standard Annex, if so specified in the Conformance chapter of that version of the + Unicode Standard. The version number of a UAX document corresponds to the version of the Unicode Standard + of which it forms a part. +

    +
    +

    + Please submit corrigenda and other comments with the online reporting form [Feedback]. Related information that is useful in + understanding this annex is found in Unicode Standard Annex #41, “Common References for Unicode Standard + Annexes.” For the latest version of the Unicode Standard, see [Unicode]. For a list of current Unicode + Technical Reports, see [Reports]. For more information about + versions of the Unicode Standard, see [Versions]. For any + errata which may apply to this annex, see [Errata]. +

    +

    Contents

    + +
    +

    + 1 Introduction +

    +

    In working on Unicode implementations, it is often useful to access the full content of the Unicode + Character Database (UCD). For example, in establishing mappings from characters to glyphs in fonts, it is + convenient to see the character scalar value, the character name, the character East Asian width, along with + the shape and metrics of the proposed glyph to map to; looking at all this data simultaneously helps in + evaluating the mapping. +

    +

    Directly accessing the data files that constitute the UCD is sometimes a daunting proposition. The data is + dispersed in a number of files of various formats, and there are just enough peculiarities (all justified by + the processing power available at the time the UCD representation was designed) to require a fairly intimate + knowledge of the data format itself, in addition to the meaning of the data. +

    +

    Many programming environments (for example, Java or ICU) do give access to the UCD. However, those + environments tend to lag behind releases of the standard, or support only some of the UCD content. +

    +

    Unibook is a wonderful tool to explore the UCD and in many cases is just the ticket; however, it is + difficult to use when the task at hand has not been built-in, or when non-UCD data is to be displayed as + well. +

    +

    This annex presents an alternative representation of the UCD, which is meant to overcome these + difficulties. We have chosen an XML representation, because parsing becomes a non-issue: there are a number + of XML parsers freely available, and using them is often fairly easy. In addition, there are freely + available tools that can perform powerful operations on XML data; for example, XPATH and XQUERY engines can + be thought of as a “grep” for XML data and XSLT engines can be thought of as + “awk” for XML data. +

    +

    It is important to note that we are interested in exploring the content of the UCD, rather than in using + the UCD data to process character streams. Thus, we are not concerned so much by the speed of processing or + the size of our representation. +

    +

    Our representation supports the creation of documents that represent only parts of the UCD, either by not + representing all the characters, or by not representing all the properties. This can be useful when only + some of the data is needed. +

    +

    This annex presents only the XML representation format of the UCD. The data itself is part of the Unicode + Character Database. +

    +

    + 2 Overall schema +

    +

    + 2.1 General principles +

    +

    Our schema can be used to create and validate documents which are intended to represent properties of + Unicode code points, blocks, named sequences, normalization corrections, standardized variants, CJK + radicals and emoji sources. A document may represent the values actually assigned in a given version of + the UCD, or it may represent a draft version of the UCD, or a private agreement on Private Use + characters. The validity of an XML document with respect to the schema defined in this annex does not + assert anything about the correctness of the values. +

    +

    Valid documents may provide values for only some of the code points, or some of the Unicode + properties. Furthermore, they may also incorporate non-Unicode properties. +

    +

    Our schema is defined using English. However, a useful subset of the validity constraints can be + captured using a schema language, thereby simplifying the task of validating documents. We have chosen + Relax NG [ISO 19757], + in the compact syntax, as the schema language. It is important to stress that the schema which is + defined in English imposes more constraints on the documents than can be validated with the Relax NG + schema. +

    +

    An important characteristic of Relax NG is that its schemas do not modify or augment the infoset of + the documents. Therefore, it is possible to process our XML representation without using the schema. + Also, the schema is relatively straightforward and can be converted mechanically to other schema + languages. +

    +

    While our XML representation is not intended to be used during processing of characters and strings, + it is still a design principle for our schema to support the relatively efficient representation of the + UCD. This is achieved by an inheritance mechanism, similar to property inheritance in CSS or in XSL:FO + (see section 4.3 Group). +

    +

    Many invariants impose constraints on the values of the different properties for a given code point. + For example, if the value of the Numeric Type property is None, then the value of the + Numeric Value property should be the empty string; and if the value of the Other + Alphabetic property is true, then the value of the Alphabetic property should be + true. Those invariants are not captured in the schema. +

    +

    + 2.2 Namespace +

    +

    The namespace for our elements is “http://www.unicode.org/ns/2003/ucd/1.0”. Our + attributes are in the empty namespace. +

    +

    + + [namespace declaration, + 1] + + = + + default namespace ucd = "http://www.unicode.org/ns/2003/ucd/1.0" + +

    +

    In all our examples, we assume that this namespace is the default one. +

    +

    + 2.3 Datatypes +

    +

    We use the standard XML Schema datatypes:

    +

    + + [datatypes declaration, + 2] + + = + + # default; datatypes xsd = "http://www.w3.org/2001/XMLSchema-datatypes" + +

    +

    Characters are pervasive in the UCD, and will need to be represented. Representing characters directly + by themselves would seem the most obvious choice; for example, we could express that the decomposition + of U+00E8 is “&#x0065;&#x0300;”, that is have exactly two characters in (the + infoset of) the XML document. However, the current XML specification limits the set of characters + that can be part of a document. Another problem is that the various tools (XML parser, XPATH engine, + etc.) may equate U+00E8 with U+0065 U+0300, thus making it difficult to figure out which of the two + sequences is contained in the database (which is sometimes important for our purposes). Therefore, we + chose instead to represent characters by their code points; we follow the usual convention of four to + six hexadecimal digits (uppercase) and code points in a sequence separated by space; for example, the + decomposition of U+00E8 will be represented by the nine characters “0065 0300” in the + infoset. +

    +

    + + [datatype for code points, + 3] + + = + + single-code-point = xsd:string { pattern = "(|[1-9A-F]|(10))[0-9A-F]{4}" } + + one-or-more-code-points = list { single-code-point + } + zero-or-more-code-points = list { single-code-point * } + two-code-points = list { single-code-point, single-code-point } + +

    +

    + 2.4 Root Element +

    +

    The root element of valid documents is a ucd. +

    +

    + + [schema start, + 4] + + = + + start = + element ucd { ucd.content } + +

    +

    + 2.5 Common attributes +

    +

    A large number of properties are boolean. We uniformly use the values Y and + N for those: +

    +

    + + [boolean, + 5] + + = + + boolean = "Y" | "N" + +

    +

    + 2.6 Ordering of elements +

    +

    In elements that hold lists of child elements, such as repertoire, + group, or standardized-variants, the schema does not require that the + child elements be in any particular order. +

    +

    + 3 Description +

    +

    The root element may have a description child element, which in turn contains any string, + which is meant to describe what the XML document purports to describe. +

    +

    It is recommended that if the document purports to represent the UCD of some Unicode version, the + description be selected in accord with the rules listed in [Versions]; and + conversely, that documents which do not purport to represent the UCD be described as such. +

    +

    + + [description, + 6] + + = + + ucd.content &= + element description { text }? + +

    +

    + 4 Repertoire +

    +

    The repertoire child element of the ucd element describes the code points and + their properties. As we will see shortly, code points can be described individually or as part of a group: +

    +

    + + [repertoire, + 7] + + = + + ucd.content &= + element repertoire { (code-point | group) + }? + +

    +

    + 4.1 Sets of code points +

    +

    It is often the case that successive code points have the same property values, for a given set of + properties. The most striking example is that of an unallocated plane, where all but the last two + code points are reserved and have the same property values. Another example is the URO (U+4E00 + .. U+9FA5) where all the code points have the same property values if we ignore their name and their + Unihan properties. +

    +

    + + [Set of code points, + 8] + + = + + set-of-code-points = + attribute cp { single-code-point } + | ( attribute first-cp { single-code-point }, + attribute last-cp { single-code-point } ) + +

    +

    This observation suggests that it is profitable to represent sets of code points which share the + same properties, rather than individual code points. To make the representation of the sets simple, + we restrict them to be segments in the code point space, that is a set is defined by the first and + last code point it contains. Those are captured by the attributes first-cp and + last-cp. The attribute cp is a shorthand notation for the case where the set + has a single code point. +

    +

    In the repertoire, there must be at most one code-point + element for a given code point. +

    +

    + 4.2 Code point types +

    +

    When thinking about Unicode code points, it is useful to split them into four types: +

    + those assigned to abstract characters (PUA or not) + the noncharacters + the surrogate code points + the reserved code points +

    This leads to four elements to describe sets of code points: +

    +

    + + [Code points, + 9] + + = + + code-point |= + element reserved { + set-of-code-points, + code-point-attributes } + + code-point |= + element noncharacter { + set-of-code-points, + code-point-attributes } + + code-point |= + element surrogate { + set-of-code-points, + code-point-attributes } + + code-point |= + element char { + set-of-code-points, + code-point-attributes } + +

    +

    + 4.3 Group +

    +

    While we already recognized the situation where a set of code points have exactly the same set of + property values, another common situation is that of code points which have almost all the same + property values. +

    +

    For example, the characters U+1740 BUHID LETTER A .. U+1753 BUHID VOWEL SIGN U all have the age + “3.2”, and all have the script “Buhd”. On the one hand, it is convenient + to support data files in which those properties are explicitly listed with every code point, as this + makes answering questions like “what is the age of U+1749?” easier, because that data + is expressed right there. On the other hand, this leads to rather large data files, and it also tends + to obscure the differences between similar characters. +

    +

    Our representation accounts for this situation with the notion of groups. A + group element is simply a container of code points that also holds default values for + the properties. If a code point inside a group does not list explicitly a property but the + group lists it, then the code point inherits that property from its + group. For example, the fragment with explicit properties: +

    +
    +    <char cp="1740" age="3.2" na="BUHID LETTER A" gc="Lo" sc="Buhd"/>
    +    <char cp="1741" age="3.2" na="BUHID LETTER I" gc="Lo" sc="Buhd"/>
    +    <char cp="1752" age="3.2" na="BUHID VOWEL SIGN I" gc="Mn" sc="Buhd"/>
    +    <char cp="1820" age="3.0" na="MONGOLIAN LETTER A" gc="Lo" sc="Mong"/>
    +

    is equivalent to this fragment which uses a group: +

    +
    +    <group age="3.2" gc="Lo" sc="Buhd">
    +        <char cp="1740" na="BUHID LETTER A"/>
    +        <char cp="1741" na="BUHID LETTER I"/>
    +        <char cp="1752" na="BUHID VOWEL SIGN I" gc="Mn"/>
    +        <char cp="1820" age="3.0" na="MONGOLIAN LETTER A" sc="Mong"/>
    +    </group>
    +

    The element for U+1740 does not have the age attribute, and it therefore inherits it + from its enclosing group element, that is “3.2”. On the other hand, + the element for U+1820 does have this attribute, so the value is “3.0”. +

    +

    As this example illustrates, the notion of group does not necessarily align with the + notion of Unicode block. It is entirely defined and limited to our representation. In particular, the + value of a property for a code point can always be determined from the XML document alone, assuming + that this property and this code point are expressed at all. Of course, one may create an XML + representation where the groups happen to coincide with the Unicode blocks. +

    +

    Groups cannot be nested. The motivation for this limitation is to make the life of consumers + easier: either a property is defined by the element for a code point, or it is defined by the + immediately enclosing group element. +

    +

    + + [groups, + 10] + + = + + group = + element group { + code-point-attributes, + code-point* } + +

    +

    + 4.4 Properties +

    +

    Each property, except for the Special_Case_Condition and Name_Alias + properties, is represented by an attribute. In an XML data file, the absence of an attribute (possibly + only on some code-points) means that the document does not express the value + of the corresponding property. Conversely, the presence of an attribute is an expression of the + corresponding property value; the implied null value is represented by the empty string. +

    +

    The Name_Alias property is represented by zero or more name-alias child + elements. Unlike the situation for properties represented by attributes, it is not possible to determine + whether all the aliases have been represented in a data file by inspecting that data file. +

    +

    The name of an attribute is the abbreviated name of the property as given in the file + PropertyAliases.txt in the corresponding version of the UCD. For the Unihan + properties, the name is that given in the various versions of the Unihan database. +

    +

    For catalog and enumerated properties, the values are those listed in the file + PropertyValueAliases.txt in the corresponding version of the UCD; if there is an abbreviated + name, it is used, otherwise the long name is used. +

    +

    Note that the set of possible values for a property captured in this schema may change from one + version to the next. +

    +

    + 4.4.1 Age property +

    +

    The age attribute captures the version of Unicode in which a code point was + assigned to an abstract character, or made a surrogate or non-character. +

    +

    + + [age attribute, + 11] + + = + + code-point-attributes &= + attribute age { "1.1" + | "2.0" | "2.1" + | "3.0" | "3.1" | "3.2" + | "4.0" | "4.1" + | "5.0" | "5.1" | "5.2" + | "6.0" | "6.1" | "6.2" | "6.3" + | "7.0" + | "8.0" + | "9.0" + | "10.0" + | "11.0" + | "12.0" | "12.1" + | "13.0" + | "14.0" + | "15.0" | "15.1" + | "16.0" + | "17.0" + | "unassigned" + }? + +

    +

    + 4.4.2 Name properties +

    +

    There are two name properties: the name given by the current version of the standard + (na), and possibly the name this character had in version 1.0 of the standard + (na1). +

    +

    + + [na attribute, + 12] + + = + + code-point-attributes &= + attribute na { "" | + "CJK UNIFIED IDEOGRAPH-#" | + "CJK COMPATIBILITY IDEOGRAPH-#" | + "EGYPTIAN HIEROGLYPH-#" | + "TANGUT IDEOGRAPH-#" | + "KHITAN SMALL SCRIPT CHARACTER-#" | + "NUSHU CHARACTER-#" | + xsd:string { pattern="[a-zA-Z0-9]+(( -|- |[\-_ ])[a-zA-Z0-9]+)*" } + }? + +

    +

    + + [na1 attribute, + 13] + + = + + code-point-attributes &= + attribute na1 { "" | xsd:string { pattern="[a-zA-Z0-9]+([\-_ ][a-zA-Z0-9]+)*( \(.*\))?" } }? + +

    +

    The majority of the characters in Unicode have a name which is of the form CJK UNIFIED + IDEOGRAPH-<code point>. It also happens that character names cannot + contain the character U+0023 # NUMBER SIGN, so we adopted the following convention: if a + code point has the attribute na (either directly or by inheritance from an enclosing + group), then occurrences of the character # in the name are to be interpreted as the value of the + code point. For example: +

    +
    +    <char cp="3400" na="CJK UNIFIED IDEOGRAPH-3400"/>
    +

    and

    +
    +    <char cp="3400" na="CJK UNIFIED IDEOGRAPH-#"/>
    +

    are equivalent. The # can be in any position in the value of the na + attribute. The convention also applies just as well to a set of multiple code points: +

    +
    +    <char cp="3400" na="CJK UNIFIED IDEOGRAPH-3400"/>
    +    <char cp="3401" na="CJK UNIFIED IDEOGRAPH-3401"/>
    +

    is equivalent to

    +
    +    <char cp="3400" na="CJK UNIFIED IDEOGRAPH-#"/>
    +    <char cp="3401" na="CJK UNIFIED IDEOGRAPH-#"/>
    +

    which in turn is equivalent to:

    +
    +    <char first-cp="3400" last-cp="3401" na="CJK UNIFIED IDEOGRAPH-#"/>
    +

    + 4.4.3 Name Alias properties +

    +

    The Name_Alias property is represented by zero or more name-alias + child elements: +

    +

    + + [name-alias element, + 14] + + = + + code-point-attributes &= + element name-alias { + attribute alias { xsd:string { pattern="[a-zA-Z0-9]+(( -|- |[\-_ ])[a-zA-Z0-9]+)*" } }?, + attribute type { "abbreviation" | "alternate" + | "control" | "correction" + | "figment" + }? } * + +

    +

    + 4.4.4 Block property +

    +

    The Block property is represented by the blk attribute: +

    +

    + + [blk attribute, + 15] + + = + + code-point-attributes &= + attribute blk { "Adlam" + | "Aegean_Numbers" + | "Ahom" + | "Alchemical" + | "Alphabetic_PF" + | "Anatolian_Hieroglyphs" + | "Ancient_Greek_Music" + | "Ancient_Greek_Numbers" + | "Ancient_Symbols" + | "Arabic" + | "Arabic_Ext_A" + | "Arabic_Ext_B" + | "Arabic_Ext_C" + | "Arabic_Math" + | "Arabic_PF_A" + | "Arabic_PF_B" + | "Arabic_Sup" + | "Armenian" + | "Arrows" + | "ASCII" + | "Avestan" + | "Balinese" + | "Bamum" + | "Bamum_Sup" + | "Bassa_Vah" + | "Batak" + | "Bengali" + | "Bhaiksuki" + | "Block_Elements" + | "Bopomofo" + | "Bopomofo_Ext" + | "Box_Drawing" + | "Brahmi" + | "Braille" + | "Buginese" + | "Buhid" + | "Byzantine_Music" + | "Carian" + | "Caucasian_Albanian" + | "Chakma" + | "Cham" + | "Cherokee" + | "Cherokee_Sup" + | "Chess_Symbols" + | "Chorasmian" + | "CJK" + | "CJK_Compat" + | "CJK_Compat_Forms" + | "CJK_Compat_Ideographs" + | "CJK_Compat_Ideographs_Sup" + | "CJK_Ext_A" + | "CJK_Ext_B" + | "CJK_Ext_C" + | "CJK_Ext_D" + | "CJK_Ext_E" + | "CJK_Ext_F" + | "CJK_Ext_G" + | "CJK_Ext_H" + | "CJK_Ext_I" + | "CJK_Radicals_Sup" + | "CJK_Strokes" + | "CJK_Symbols" + | "Compat_Jamo" + | "Control_Pictures" + | "Coptic" + | "Coptic_Epact_Numbers" + | "Counting_Rod" + | "Cuneiform" + | "Cuneiform_Numbers" + | "Currency_Symbols" + | "Cypriot_Syllabary" + | "Cypro_Minoan" + | "Cyrillic" + | "Cyrillic_Ext_A" + | "Cyrillic_Ext_B" + | "Cyrillic_Ext_C" + | "Cyrillic_Ext_D" + | "Cyrillic_Sup" + | "Deseret" + | "Devanagari" + | "Devanagari_Ext" + | "Devanagari_Ext_A" + | "Diacriticals" + | "Diacriticals_Ext" + | "Diacriticals_For_Symbols" + | "Diacriticals_Sup" + | "Dingbats" + | "Dives_Akuru" + | "Dogra" + | "Domino" + | "Duployan" + | "Early_Dynastic_Cuneiform" + | "Egyptian_Hieroglyph_Format_Controls" + | "Egyptian_Hieroglyphs" + | "Egyptian_Hieroglyphs_Ext_A" + | "Elbasan" + | "Elymaic" + | "Emoticons" + | "Enclosed_Alphanum" + | "Enclosed_Alphanum_Sup" + | "Enclosed_CJK" + | 
"Enclosed_Ideographic_Sup" + | "Ethiopic" + | "Ethiopic_Ext" + | "Ethiopic_Ext_A" + | "Ethiopic_Ext_B" + | "Ethiopic_Sup" + | "Garay" + | "Geometric_Shapes" + | "Geometric_Shapes_Ext" + | "Georgian" + | "Georgian_Ext" + | "Georgian_Sup" + | "Glagolitic" + | "Glagolitic_Sup" + | "Gothic" + | "Grantha" + | "Greek" + | "Greek_Ext" + | "Gujarati" + | "Gunjala_Gondi" + | "Gurmukhi" + | "Gurung_Khema" + | "Half_And_Full_Forms" + | "Half_Marks" + | "Hangul" + | "Hanifi_Rohingya" + | "Hanunoo" + | "Hatran" + | "Hebrew" + | "High_PU_Surrogates" + | "High_Surrogates" + | "Hiragana" + | "IDC" + | "Ideographic_Symbols" + | "Imperial_Aramaic" + | "Indic_Number_Forms" + | "Indic_Siyaq_Numbers" + | "Inscriptional_Pahlavi" + | "Inscriptional_Parthian" + | "IPA_Ext" + | "Jamo" + | "Jamo_Ext_A" + | "Jamo_Ext_B" + | "Javanese" + | "Kaithi" + | "Kaktovik_Numerals" + | "Kana_Ext_A" + | "Kana_Ext_B" + | "Kana_Sup" + | "Kanbun" + | "Kangxi" + | "Kannada" + | "Katakana" + | "Katakana_Ext" + | "Kawi" + | "Kayah_Li" + | "Kharoshthi" + | "Khitan_Small_Script" + | "Khmer" + | "Khmer_Symbols" + | "Khojki" + | "Khudawadi" + | "Kirat_Rai" + | "Lao" + | "Latin_1_Sup" + | "Latin_Ext_A" + | "Latin_Ext_Additional" + | "Latin_Ext_B" + | "Latin_Ext_C" + | "Latin_Ext_D" + | "Latin_Ext_E" + | "Latin_Ext_F" + | "Latin_Ext_G" + | "Lepcha" + | "Letterlike_Symbols" + | "Limbu" + | "Linear_A" + | "Linear_B_Ideograms" + | "Linear_B_Syllabary" + | "Lisu" + | "Lisu_Sup" + | "Low_Surrogates" + | "Lycian" + | "Lydian" + | "Mahajani" + | "Mahjong" + | "Makasar" + | "Malayalam" + | "Mandaic" + | "Manichaean" + | "Marchen" + | "Masaram_Gondi" + | "Math_Alphanum" + | "Math_Operators" + | "Mayan_Numerals" + | "Medefaidrin" + | "Meetei_Mayek" + | "Meetei_Mayek_Ext" + | "Mende_Kikakui" + | "Meroitic_Cursive" + | "Meroitic_Hieroglyphs" + | "Miao" + | "Misc_Arrows" + | "Misc_Math_Symbols_A" + | "Misc_Math_Symbols_B" + | "Misc_Pictographs" + | "Misc_Symbols" + | "Misc_Technical" + | "Modi" + | "Modifier_Letters" + | 
"Modifier_Tone_Letters" + | "Mongolian" + | "Mongolian_Sup" + | "Mro" + | "Multani" + | "Music" + | "Myanmar" + | "Myanmar_Ext_A" + | "Myanmar_Ext_B" + | "Myanmar_Ext_C" + | "Nabataean" + | "Nag_Mundari" + | "Nandinagari" + | "NB" + | "New_Tai_Lue" + | "Newa" + | "NKo" + | "Number_Forms" + | "Nushu" + | "Nyiakeng_Puachue_Hmong" + | "OCR" + | "Ogham" + | "Ol_Chiki" + | "Ol_Onal" + | "Old_Hungarian" + | "Old_Italic" + | "Old_North_Arabian" + | "Old_Permic" + | "Old_Persian" + | "Old_Sogdian" + | "Old_South_Arabian" + | "Old_Turkic" + | "Old_Uyghur" + | "Oriya" + | "Ornamental_Dingbats" + | "Osage" + | "Osmanya" + | "Ottoman_Siyaq_Numbers" + | "Pahawh_Hmong" + | "Palmyrene" + | "Pau_Cin_Hau" + | "Phags_Pa" + | "Phaistos" + | "Phoenician" + | "Phonetic_Ext" + | "Phonetic_Ext_Sup" + | "Playing_Cards" + | "Psalter_Pahlavi" + | "PUA" + | "Punctuation" + | "Rejang" + | "Rumi" + | "Runic" + | "Samaritan" + | "Saurashtra" + | "Sharada" + | "Shavian" + | "Shorthand_Format_Controls" + | "Siddham" + | "Sinhala" + | "Sinhala_Archaic_Numbers" + | "Small_Forms" + | "Small_Kana_Ext" + | "Sogdian" + | "Sora_Sompeng" + | "Soyombo" + | "Specials" + | "Sundanese" + | "Sundanese_Sup" + | "Sunuwar" + | "Sup_Arrows_A" + | "Sup_Arrows_B" + | "Sup_Arrows_C" + | "Sup_Math_Operators" + | "Sup_PUA_A" + | "Sup_PUA_B" + | "Sup_Punctuation" + | "Sup_Symbols_And_Pictographs" + | "Super_And_Sub" + | "Sutton_SignWriting" + | "Syloti_Nagri" + | "Symbols_And_Pictographs_Ext_A" + | "Symbols_For_Legacy_Computing" + | "Symbols_For_Legacy_Computing_Sup" + | "Syriac" + | "Syriac_Sup" + | "Tagalog" + | "Tagbanwa" + | "Tags" + | "Tai_Le" + | "Tai_Tham" + | "Tai_Viet" + | "Tai_Xuan_Jing" + | "Takri" + | "Tamil" + | "Tamil_Sup" + | "Tangsa" + | "Tangut" + | "Tangut_Components" + | "Tangut_Sup" + | "Telugu" + | "Thaana" + | "Thai" + | "Tibetan" + | "Tifinagh" + | "Tirhuta" + | "Todhri" + | "Toto" + | "Transport_And_Map" + | "Tulu_Tigalari" + | "UCAS" + | "UCAS_Ext" + | "UCAS_Ext_A" + | "Ugaritic" + | "Vai" + | 
"Vedic_Ext" + | "Vertical_Forms" + | "Vithkuqi" + | "VS" + | "VS_Sup" + | "Wancho" + | "Warang_Citi" + | "Yezidi" + | "Yi_Radicals" + | "Yi_Syllables" + | "Yijing" + | "Zanabazar_Square" + | "Znamenny_Music" + }? + +

    +

    + 4.4.5 General Category +

    +

    The general category is represented by the gc attribute. +

    +

    + + [gc attribute, + 16] + + = + + code-point-attributes &= + attribute gc { "Cc" | "Cf" | "Cn" | "Co" | "Cs" + | "Ll" | "Lm" | "Lo" | "Lt" | "Lu" + | "Mc" | "Me" | "Mn" + | "Nd" | "Nl" | "No" + | "Pc" | "Pd" | "Pe" | "Pf" | "Pi" | "Po" | "Ps" + | "Sc" | "Sk" | "Sm" | "So" + | "Zl" | "Zp" | "Zs" + }? + +

    +

    + 4.4.6 Combining properties +

    +

    The combining class is represented by the ccc attribute, which holds the decimal + representation of the combining class. +

    +

    Because the set of values that this property has taken across the various versions of the UCD + is rather large, our schema does not restrict the possible values to those actually used. +

    +

    + + [ccc attribute, + 17] + + = + + code-point-attributes &= + attribute ccc { xsd:integer { minInclusive="0" maxInclusive="254" } }? + +

    +

    + 4.4.7 Bidirectionality properties +

    +

    The bidirectional class is represented by the bc attribute. +

    +

    + + [bc attribute, + 18] + + = + + code-point-attributes &= + attribute bc { "AL" | "AN" + | "B" | "BN" + | "CS" + | "EN" | "ES" | "ET" + | "FSI" + | "L" | "LRE" | "LRI" | "LRO" + | "NSM" + | "ON" + | "PDF" | "PDI" + | "R" | "RLE" | "RLI" | "RLO" + | "S" + | "WS" + }? + +

    +

    The mirrored property is represented by the Bidi_M attribute, which takes a + boolean value. +

    +

    + + [Bidi_M attribute, + 19] + + = + + code-point-attributes &= + attribute Bidi_M { boolean }? + +

    +

    The bmg attribute is the code point of a character whose glyph is typically + a mirrored image of the glyph for the current character. +

    +

    + + [bmg attribute, + 20] + + = + + code-point-attributes &= + attribute bmg { "" | single-code-point }? + +

    +

    Note that we do not express the “Best Fit” element recorded in BidiMirroring.txt. + For one thing, it is not meant to be machine readable. More importantly, the idea underlying the + mirrored glyph is delicate to use, since it makes assumptions about the design of the fonts, and + the best fit goes even farther. +

    +

    The Bidi_Control property is represented by the Bidi_C attribute. +

    +

    + + [Bidi_C attribute, + 21] + + = + + code-point-attributes &= + attribute Bidi_C { boolean }? + +

    +

    The bidi paired bracket type and bidi paired bracket properties are represented by the + bpt and bpb attributes respectively. +

    +

    + + [bpt attribute, + 22] + + = + + code-point-attributes &= + attribute bpt { "o" | "c" | "n" }? + +

    +

    + + [bpb attribute, + 23] + + = + + code-point-attributes &= + attribute bpb { "#" | single-code-point }? + +

    +

    + 4.4.8 Decomposition properties +

    +

    The decomposition type and decomposition mapping properties are represented by the dt + and dm attributes. +

    +

    Most characters have a decomposition mapping to themselves. This is very similar to the + situation we encountered with names, and we adopted a similar convention: if the value of a + decomposition mapping is the character itself, we use the attribute value # (U+0023 # + NUMBER SIGN) as a shorthand notation; this enables those attributes to be captured in groups. +

    +

    + + [decomposition properties, + 24] + + = + + code-point-attributes &= + attribute dt { "can" | "com" | "enc" | "fin" | "font" | "fra" + | "init" | "iso" | "med" | "nar" | "nb" | "sml" + | "sqr" | "sub" | "sup" | "vert" | "wide" | "none" + }? + + code-point-attributes &= + attribute dm { "#" | zero-or-more-code-points }? + +

    +

    The properties Composition_Exclusion and Full_Composition_Exclusion are + represented by the attributes CE and Comp_Ex: +

    +

    + + [composition properties, + 25] + + = + + code-point-attributes &= + attribute CE { boolean }? + + code-point-attributes &= + attribute Comp_Ex { boolean }? + +

    +

    The properties NFC_Quick_Check, NFD_Quick_Check, + NFKC_Quick_Check, NFKD_Quick_Check, Expands_On_NFC, + Expands_On_NFD, Expands_On_NFKC, Expands_On_NFKD, and + FC_NFKC_Closure have corresponding attributes. +

    +

    + + [quick check properties, + 26] + + = + + code-point-attributes &= + attribute NFC_QC { "Y" | "N" | "M" }? + + code-point-attributes &= + attribute NFD_QC { "Y" | "N" }? + + code-point-attributes &= + attribute NFKC_QC { "Y" | "N" | "M" }? + + code-point-attributes &= + attribute NFKD_QC { "Y" | "N" }? + + + code-point-attributes &= + attribute XO_NFC { boolean }? + + code-point-attributes &= + attribute XO_NFD { boolean }? + + code-point-attributes &= + attribute XO_NFKC { boolean }? + + code-point-attributes &= + attribute XO_NFKD { boolean }? + + + code-point-attributes &= + attribute FC_NFKC { "#" | one-or-more-code-points }? + +

    +

    + 4.4.9 Numeric Properties +

    +

    The numeric type is represented by the nt attribute. +

    +

    The numeric value is represented by the nv attribute, expressed as a whole + number or a fraction, or as NaN when the character has no numeric value. +

    +

    + + [numeric properties, + 27] + + = + + code-point-attributes &= + attribute nt { "De" | "Di" | "Nu" | "None" }? + + code-point-attributes &= + attribute nv { "NaN" | xsd:string { pattern="-?[0-9]+(/[0-9]+)?" } }? + +

    +

    + 4.4.10 Joining properties +

    +

    The joining type of a character is represented by the jt attribute. +

    +

    The jg attribute is the joining group of the character. +

    +

    + + [joining properties, + 28] + + = + + code-point-attributes &= + attribute jt { "C" | "D" | "L" | "R" | "T" | "U" }? + + code-point-attributes &= + attribute jg { "African_Feh" | "African_Noon" | "African_Qaf" + | "Ain" | "Alaph" | "Alef" + | "Beh" | "Beth" | "Burushaski_Yeh_Barree" + | "Dal" | "Dalath_Rish" + | "E" + | "Farsi_Yeh" | "Fe" | "Feh" | "Final_Semkath" + | "Gaf" | "Gamal" + | "Hah" | "Hanifi_Rohingya_Kinna_Ya" + | "Hanifi_Rohingya_Pa" | "He" | "Heh" | "Heh_Goal" + | "Heth" + | "Kaf" | "Kaph" | "Kashmiri_Yeh" | "Khaph" + | "Knotted_Heh" + | "Lam" | "Lamadh" + | "Malayalam_Bha" | "Malayalam_Ja" | "Malayalam_Lla" + | "Malayalam_Llla" | "Malayalam_Nga" + | "Malayalam_Nna" | "Malayalam_Nnna" + | "Malayalam_Nya" | "Malayalam_Ra" | "Malayalam_Ssa" + | "Malayalam_Tta" | "Manichaean_Aleph" + | "Manichaean_Ayin" | "Manichaean_Beth" + | "Manichaean_Daleth" | "Manichaean_Dhamedh" + | "Manichaean_Five" | "Manichaean_Gimel" + | "Manichaean_Heth" | "Manichaean_Hundred" + | "Manichaean_Kaph" | "Manichaean_Lamedh" + | "Manichaean_Mem" | "Manichaean_Nun" + | "Manichaean_One" | "Manichaean_Pe" + | "Manichaean_Qoph" | "Manichaean_Resh" + | "Manichaean_Sadhe" | "Manichaean_Samekh" + | "Manichaean_Taw" | "Manichaean_Ten" + | "Manichaean_Teth" | "Manichaean_Thamedh" + | "Manichaean_Twenty" | "Manichaean_Waw" + | "Manichaean_Yodh" | "Manichaean_Zayin" | "Meem" + | "Mim" + | "No_Joining_Group" | "Noon" | "Nun" | "Nya" + | "Pe" + | "Qaf" | "Qaph" + | "Reh" | "Reversed_Pe" | "Rohingya_Yeh" + | "Sad" | "Sadhe" | "Seen" | "Semkath" | "Shin" + | "Straight_Waw" | "Swash_Kaf" | "Syriac_Waw" + | "Tah" | "Taw" | "Teh_Marbuta" | "Teh_Marbuta_Goal" + | "Teth" | "Thin_Yeh" + | "Vertical_Tail" + | "Waw" + | "Yeh" | "Yeh_Barree" | "Yeh_With_Tail" | "Yudh" + | "Yudh_He" + | "Zain" | "Zhain" + }? + +

    +

    The Join_Control property is represented by the Join_C attribute. +

    +

    + + [joining properties, + 29] + + = + + code-point-attributes &= + attribute Join_C { boolean }? + +

    +

    + 4.4.11 Linebreak properties +

    +

    The Line_Break property is represented by the lb attribute. +

    +

    + + [lb attribute, + 30] + + = + + code-point-attributes &= + attribute lb { "AI" | "AK" | "AL" | "AP" | "AS" + | "B2" | "BA" | "BB" | "BK" + | "CB" | "CJ" | "CL" | "CM" | "CP" | "CR" + | "EB" | "EM" | "EX" + | "GL" + | "H2" | "H3" | "HL" | "HY" + | "ID" | "IN" | "IS" + | "JL" | "JT" | "JV" + | "LF" + | "NL" | "NS" | "NU" + | "OP" + | "PO" | "PR" + | "QU" + | "RI" + | "SA" | "SG" | "SP" | "SY" + | "VF" | "VI" + | "WJ" + | "XX" + | "ZW" | "ZWJ" + }? + +

    +

    + 4.4.12 East Asian Width property +

    +

    The East Asian width property is represented by the ea attribute. +

    +

    + + [ea attribute, + 31] + + = + + code-point-attributes &= + attribute ea { "A" | "F" | "H" | "N" | "Na" | "W" }? + +

    +

    + 4.4.13 Case properties +

    +

    The Uppercase, Lowercase, Other_Uppercase and + Other_Lowercase properties are represented by corresponding attributes. +

    +

    + + [casing properties, + 32] + + = + + code-point-attributes &= + attribute Upper { boolean }? + + code-point-attributes &= + attribute Lower { boolean }? + + code-point-attributes &= + attribute OUpper { boolean }? + + code-point-attributes &= + attribute OLower { boolean }? + +

    +

    Most characters have case mapping and case folding properties that simply map or fold to + themselves. This is very similar to the situation we encountered with names, and we adopted a + similar convention: if the value of a case mapping or case folding property is the character + itself, we use the attribute value # (U+0023 # NUMBER SIGN) as a shorthand notation; this + enables those attributes to be captured in groups. +

    +

    The simple case mappings are recorded in the suc, slc, stc + attributes. +

    +

    + + [casing properties, + 33] + + = + + code-point-attributes &= + attribute suc { "#" | single-code-point }? + + code-point-attributes &= + attribute slc { "#" | single-code-point }? + + code-point-attributes &= + attribute stc { "#" | single-code-point }? + +

    +

    The non-simple case mappings are recorded in the uc, lc and tc + attributes. +

    +

    + + [casing properties, + 34] + + = + + code-point-attributes &= + attribute uc { "#" | one-or-more-code-points }? + + code-point-attributes &= + attribute lc { "#" | one-or-more-code-points }? + + code-point-attributes &= + attribute tc { "#" | one-or-more-code-points }? + +

    +

    The Simple_Case_Folding and Case_Folding properties are recorded in the + scf and cf attributes respectively. +

    +

    + + [casing properties, + 35] + + = + + code-point-attributes &= + attribute scf { "#" | single-code-point }? + + code-point-attributes &= + attribute cf { "#" | one-or-more-code-points }? + +

    +

    The Case_Ignorable, Cased, Changes_When_Casefolded, + Changes_When_Casemapped, Changes_When_Lowercased, + Changes_When_NFKC_Casefolded, Changes_When_Titlecased, + Changes_When_Uppercased, NFKC_Casefold, and + NFKC_Simple_Casefold properties are recorded in these attributes: +

    +

    + + [casing properties, + 36] + + = + + code-point-attributes &= + attribute CI { boolean }? + + code-point-attributes &= + attribute Cased { boolean }? + + code-point-attributes &= + attribute CWCF { boolean }? + + code-point-attributes &= + attribute CWCM { boolean }? + + code-point-attributes &= + attribute CWL { boolean }? + + code-point-attributes &= + attribute CWKCF { boolean }? + + code-point-attributes &= + attribute CWT { boolean }? + + code-point-attributes &= + attribute CWU { boolean }? + + code-point-attributes &= + attribute NFKC_CF { "#" | zero-or-more-code-points }? + + code-point-attributes &= + attribute NFKC_SCF { "#" | zero-or-more-code-points }? + +

    +

    Note that the UCD records more information about case folding than is expressed in the + properties, specifically the entries in CaseFolding.txt with status T. +

    +

    + 4.4.14 Script properties +

    +

    The script and script extension properties are represented by the sc and + scx attributes respectively. +

    +

    + + [script properties, + 37] + + = + + script = "Adlm" | "Aghb" | "Ahom" | "Arab" | "Armi" | "Armn" + | "Avst" + | "Bali" | "Bamu" | "Bass" | "Batk" | "Beng" | "Bhks" + | "Bopo" | "Brah" | "Brai" | "Bugi" | "Buhd" + | "Cakm" | "Cans" | "Cari" | "Cham" | "Cher" | "Chrs" + | "Copt" | "Cpmn" | "Cprt" | "Cyrl" + | "Deva" | "Diak" | "Dogr" | "Dsrt" | "Dupl" + | "Egyp" | "Elba" | "Elym" | "Ethi" + | "Gara" | "Geor" | "Glag" | "Gong" | "Gonm" | "Goth" + | "Gran" | "Grek" | "Gujr" | "Gukh" | "Guru" + | "Hang" | "Hani" | "Hano" | "Hatr" | "Hebr" | "Hira" + | "Hluw" | "Hmng" | "Hmnp" | "Hrkt" | "Hung" + | "Ital" + | "Java" + | "Kali" | "Kana" | "Kawi" | "Khar" | "Khmr" | "Khoj" + | "Kits" | "Knda" | "Krai" | "Kthi" + | "Lana" | "Laoo" | "Latn" | "Lepc" | "Limb" | "Lina" + | "Linb" | "Lisu" | "Lyci" | "Lydi" + | "Mahj" | "Maka" | "Mand" | "Mani" | "Marc" | "Medf" + | "Mend" | "Merc" | "Mero" | "Mlym" | "Modi" | "Mong" + | "Mroo" | "Mtei" | "Mult" | "Mymr" + | "Nagm" | "Nand" | "Narb" | "Nbat" | "Newa" | "Nkoo" + | "Nshu" + | "Ogam" | "Olck" | "Onao" | "Orkh" | "Orya" | "Osge" + | "Osma" | "Ougr" + | "Palm" | "Pauc" | "Perm" | "Phag" | "Phli" | "Phlp" + | "Phnx" | "Plrd" | "Prti" + | "Rjng" | "Rohg" | "Runr" + | "Samr" | "Sarb" | "Saur" | "Sgnw" | "Shaw" | "Shrd" + | "Sidd" | "Sind" | "Sinh" | "Sogd" | "Sogo" | "Sora" + | "Soyo" | "Sund" | "Sunu" | "Sylo" | "Syrc" + | "Tagb" | "Takr" | "Tale" | "Talu" | "Taml" | "Tang" + | "Tavt" | "Telu" | "Tfng" | "Tglg" | "Thaa" | "Thai" + | "Tibt" | "Tirh" | "Tnsa" | "Todr" | "Toto" | "Tutg" + | "Ugar" + | "Vaii" | "Vith" + | "Wara" | "Wcho" + | "Xpeo" | "Xsux" + | "Yezi" | "Yiii" + | "Zanb" | "Zinh" | "Zyyy" | "Zzzz" + + code-point-attributes &= + attribute sc { script }? + + code-point-attributes &= + attribute scx { list { script + } }? + +

    +

    + 4.4.15 ISO Comment properties +

    +

    The ISO 10646 comment field is represented by the isc attribute. +

    +

    + + [isc attribute, + 38] + + = + + code-point-attributes &= + attribute isc { text }? + +

    +

    + 4.4.16 Hangul properties +

    +

    The property Hangul_Syllable_Type is represented by the hst attribute. +

    +

    + + [hst attribute, + 39] + + = + + code-point-attributes &= + attribute hst { "L" | "LV" | "LVT" | "NA" | "T" | "V" }? + +

    +

    The property Jamo_Short_Name is represented by the JSN attribute: +

    +

    + + [JSN attribute, + 40] + + = + + code-point-attributes &= + attribute JSN { xsd:string { pattern="[A-Z]{0,3}" } }? + +

    +

    + 4.4.17 Indic properties +

    +

    The property Indic_Syllabic_Category is represented by the InSC + attribute. +

    +

    + + [InSC attribute, + 41] + + = + + code-point-attributes &= + attribute InSC { "Avagraha" + | "Bindu" + | "Brahmi_Joining_Number" + | "Cantillation_Mark" + | "Consonant" + | "Consonant_Dead" + | "Consonant_Final" + | "Consonant_Head_Letter" + | "Consonant_Initial_Postfixed" + | "Consonant_Killer" + | "Consonant_Medial" + | "Consonant_Placeholder" + | "Consonant_Preceding_Repha" + | "Consonant_Prefixed" + | "Consonant_Subjoined" + | "Consonant_Succeeding_Repha" + | "Consonant_With_Stacker" + | "Gemination_Mark" + | "Invisible_Stacker" + | "Joiner" + | "Modifying_Letter" + | "Non_Joiner" + | "Nukta" + | "Number" + | "Number_Joiner" + | "Other" + | "Pure_Killer" + | "Register_Shifter" + | "Reordering_Killer" + | "Syllable_Modifier" + | "Tone_Letter" + | "Tone_Mark" + | "Virama" + | "Visarga" + | "Vowel" + | "Vowel_Dependent" + | "Vowel_Independent" + }? + +

    +

    The property Indic_Positional_Category is represented by the InPC + attribute: +

    +

    + + [InPC attribute, + 42] + + = + + code-point-attributes &= + attribute InPC { "Bottom" + | "Bottom_And_Left" + | "Bottom_And_Right" + | "Left" + | "Left_And_Right" + | "NA" + | "Overstruck" + | "Right" + | "Top" + | "Top_And_Bottom" + | "Top_And_Bottom_And_Left" + | "Top_And_Bottom_And_Right" + | "Top_And_Left" + | "Top_And_Left_And_Right" + | "Top_And_Right" + | "Visual_Order_Left" + }? + +

    +

    The property Indic_Conjunct_Break is represented by the InCB attribute: +

    +

    + + [InCB attribute, + 43] + + = + + code-point-attributes &= + attribute InCB { "Consonant" + | "Extend" + | "Linker" + | "None" + }? + +

    +

    + 4.4.18 Identifier and Pattern and programming language properties +

    +

    The properties ID_Start, Other_ID_Start, XID_Start, + ID_Continue, Other_ID_Continue, XID_Continue, + ID_Compat_Math_Start, and ID_Compat_Math_Continue are represented by + corresponding attributes: +

    +

    + + [identifier properties, + 44] + + = + + code-point-attributes &= + attribute IDS { boolean }? + + code-point-attributes &= + attribute OIDS { boolean }? + + code-point-attributes &= + attribute XIDS { boolean }? + + code-point-attributes &= + attribute IDC { boolean }? + + code-point-attributes &= + attribute OIDC { boolean }? + + code-point-attributes &= + attribute XIDC { boolean }? + + code-point-attributes &= + attribute ID_Compat_Math_Start { boolean }? + + code-point-attributes &= + attribute ID_Compat_Math_Continue { boolean }? + +

    +

    The properties Pattern_Syntax and Pattern_White_Space are represented + by corresponding attributes: +

    +

    + + [pattern properties, + 45] + + = + + code-point-attributes &= + attribute Pat_Syn { boolean }? + + code-point-attributes &= + attribute Pat_WS { boolean }? + +

    +

    + 4.4.19 Properties related to function and graphic characteristics +

    +

    The properties Dash, Hyphen, Quotation_Mark, + Terminal_Punctuation, Sentence_Terminal, Diacritic, + Extender, Soft_Dotted, Alphabetic, + Other_Alphabetic, Math, Other_Math, Hex_Digit, + ASCII_Hex_Digit, Default_Ignorable_Code_Point, + Other_Default_Ignorable_Code_Point, Logical_Order_Exception, + Prepended_Concatenation_Mark, Modifier_Combining_Mark, + White_Space, Vertical_Orientation, and Regional_Indicator + describe the function or graphic characteristic of a character, and each have a corresponding + attribute. +

    +

    + + [properties related to function and graphic characteristics, + 46] + + = + + code-point-attributes &= + attribute Dash { boolean }? + + code-point-attributes &= + attribute Hyphen { boolean }? + + code-point-attributes &= + attribute QMark { boolean }? + + code-point-attributes &= + attribute Term { boolean }? + + code-point-attributes &= + attribute STerm { boolean }? + + code-point-attributes &= + attribute Dia { boolean }? + + code-point-attributes &= + attribute Ext { boolean }? + + code-point-attributes &= + attribute SD { boolean }? + + code-point-attributes &= + attribute Alpha { boolean }? + + code-point-attributes &= + attribute OAlpha { boolean }? + + code-point-attributes &= + attribute Math { boolean }? + + code-point-attributes &= + attribute OMath { boolean }? + + code-point-attributes &= + attribute Hex { boolean }? + + code-point-attributes &= + attribute AHex { boolean }? + + code-point-attributes &= + attribute DI { boolean }? + + code-point-attributes &= + attribute ODI { boolean }? + + code-point-attributes &= + attribute LOE { boolean }? + + code-point-attributes &= + attribute PCM { boolean }? + + code-point-attributes &= + attribute MCM { boolean }? + + code-point-attributes &= + attribute WSpace { boolean }? + + code-point-attributes &= + attribute vo { "R" | "Tr" | "Tu" | "U" }? + + code-point-attributes &= + attribute RI { boolean }? + +

    +

    + 4.4.20 Properties related to boundaries +

    +

    The properties Grapheme_Base, Grapheme_Extend, + Other_Grapheme_Extend, Grapheme_Link, + Grapheme_Cluster_Break, Word_Break, and Sentence_Break each + have a corresponding attribute: +

    +

    + + [properties related to boundaries, + 47] + + = + + code-point-attributes &= + attribute Gr_Base { boolean }? + + code-point-attributes &= + attribute Gr_Ext { boolean }? + + code-point-attributes &= + attribute OGr_Ext { boolean }? + + code-point-attributes &= + attribute Gr_Link { boolean }? + + code-point-attributes &= + attribute GCB { "CN" | "CR" + | "EB" | "EBG" | "EM" | "EX" + | "GAZ" + | "L" | "LF" | "LV" | "LVT" + | "PP" + | "RI" + | "SM" + | "T" + | "V" + | "XX" + | "ZWJ" + }? + + code-point-attributes &= + attribute WB { "CR" + | "DQ" + | "EB" | "EBG" | "EM" | "EX" | "Extend" + | "FO" + | "GAZ" + | "HL" + | "KA" + | "LE" | "LF" + | "MB" | "ML" | "MN" + | "NL" | "NU" + | "RI" + | "SQ" + | "WSegSpace" + | "XX" + | "ZWJ" + }? + + code-point-attributes &= + attribute SB { "AT" + | "CL" | "CR" + | "EX" + | "FO" + | "LE" | "LF" | "LO" + | "NU" + | "SC" | "SE" | "SP" | "ST" + | "UP" + | "XX" + }? + +

    +

    + 4.4.21 Properties related to ideographs +

    +

    The properties Ideographic, Unified_Ideograph, + Equivalent_Unified_Ideograph, IDS_Binary_Operator, + IDS_Trinary_Operator, IDS_Unary_Operator, and Radical have + corresponding attributes: +

    +

    + + [properties related to ideographs, + 48] + + = + + code-point-attributes &= + attribute Ideo { boolean }? + + code-point-attributes &= + attribute UIdeo { boolean }? + + code-point-attributes &= + attribute EqUIdeo { single-code-point }? + + code-point-attributes &= + attribute IDSB { boolean }? + + code-point-attributes &= + attribute IDST { boolean }? + + code-point-attributes &= + attribute IDSU { boolean }? + + code-point-attributes &= + attribute Radical { boolean }? + +

    +

    + 4.4.22 Miscellaneous properties +

    +

    The properties Deprecated, Variation_Selector, and + Noncharacter_Code_Point have corresponding attributes: +

    +

    + + [miscellaneous properties, + 49] + + = + + code-point-attributes &= + attribute Dep { boolean }? + + code-point-attributes &= + attribute VS { boolean }? + + code-point-attributes &= + attribute NChar { boolean }? + +

    +

    + 4.4.23 Unihan properties +

    +

    The Unihan properties (from the Unihan database) are represented as attributes. +

    +

    + + [Unihan properties, + 50] + + = + + code-point-attributes &= attribute kAccountingNumeric + { xsd:string { pattern="[0-9]+" } }? + + code-point-attributes &= attribute kAlternateTotalStrokes + { list { xsd:string { pattern="(\d+:[BHJKMPSUV]+)|-" }+ } }? + + code-point-attributes &= attribute kBigFive + { xsd:string { pattern="[0-9A-F]{4}'?" } }? + + code-point-attributes &= attribute kCangjie + { xsd:string { pattern="[A-Z]+" } }? + + code-point-attributes &= attribute kCantonese + { list { xsd:string { pattern="[a-z]{1,6}[1-6]" }+ } }? + + code-point-attributes &= attribute kCCCII + { list { xsd:string { pattern="[0-9A-F]{6}" }+ } }? + + code-point-attributes &= attribute kCheungBauer + { list { xsd:string { pattern="[0-9]{3}/[0-9]{2};[A-Z]*;[a-z1-6\[\]/,]+" }+ } }? + + code-point-attributes &= attribute kCheungBauerIndex + { list { xsd:string { pattern="[0-9]{3}\.[01][0-9]" }+ } }? + + code-point-attributes &= attribute kCihaiT + { list { xsd:string { pattern="[1-9][0-9]{0,3}\.[0-9]{3}" }+ } }? + + code-point-attributes &= attribute kCNS1986 + { xsd:string { pattern="[12E]-[0-9A-F]{4}" } }? + + code-point-attributes &= attribute kCNS1992 + { xsd:string { pattern="[1-9]-[0-9A-F]{4}" } }? + + code-point-attributes &= attribute kCompatibilityVariant + { "" | xsd:string { pattern="U\+[23]?[0-9A-F]{4}" } }? + + code-point-attributes &= attribute kCowles + { list { xsd:string { pattern="[0-9]{1,4}(\.[0-9]{1,2})?" }+ } }? + + code-point-attributes &= attribute kDaeJaweon + { xsd:string { pattern="[0-9]{4}\.[0-9]{2}[01]" } }? + + code-point-attributes &= attribute kDefinition + { xsd:string { pattern='[^\t"]+' } }? + + code-point-attributes &= attribute kEACC + { xsd:string { pattern="[0-9A-F]{6}" } }? + + code-point-attributes &= attribute kFanqie + { list { xsd:string { pattern="[\x{3400}-\x{4DBF}\x{4E00}-\x{9FFF}\x{20000}-\x{2A6DF}]{2}" }+ } }? + + code-point-attributes &= attribute kFenn + { list { xsd:string { pattern="[0-9]+a?[A-KP*]" }+ } }? 
+ + code-point-attributes &= attribute kFennIndex + { list { xsd:string { pattern="[0-9][0-9]{0,2}\.[01][0-9]" }+ } }? + + code-point-attributes &= attribute kFourCornerCode + { list { xsd:string { pattern="[0-9]{4}(\.[0-9])?" }+ } }? + + code-point-attributes &= attribute kGB0 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB1 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB3 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB5 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB7 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB8 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGradeLevel + { xsd:string { pattern="[1-6]" } }? + + code-point-attributes &= attribute kGSR + { list { xsd:string { pattern="[0-9]{4}[a-vx-z]'?" }+ } }? + + code-point-attributes &= attribute kHangul + { list { xsd:string { pattern="[\x{1100}-\x{1112}][\x{1161}-\x{1175}][\x{11A8}-\x{11C2}]?:[01ENX]{1,3}" }+ } }? + + code-point-attributes &= attribute kHanYu + { list { xsd:string { pattern="[1-8][0-9]{4}\.[0-3][0-9][0-3]" }+ } }? + + code-point-attributes &= attribute kHanyuPinlu + { list { xsd:string { pattern="[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+\([0-9]+\)" }+ } }? + + code-point-attributes &= attribute kHanyuPinyin + { list { xsd:string { pattern="(\d{5}\.\d{2}0,)*\d{5}\.\d{2}0:([a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+,)*[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kHDZRadBreak + { xsd:string { pattern="[\x{2F00}-\x{2FD5}]\[U\+2F[0-9A-D][0-9A-F]\]:[1-8][0-9]{4}\.[0-3][0-9]0" } }? + + code-point-attributes &= attribute kHKGlyph + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kIBMJapan + { list { xsd:string { pattern="F[ABC][0-9A-F]{2}" }+ } }? 
+ + code-point-attributes &= attribute kIICore + { list { xsd:string { pattern="[ABC][GHJKMPT]{1,7}" }+ } }? + + code-point-attributes &= attribute kIRG_GSource + { "" | xsd:string { pattern="G[013578EKS]-[0-9A-F]{4}" } + | xsd:string { pattern="G4K(-\d{5})?" } + | xsd:string { pattern="G(DZ|GH|RM|WZ|XC|XH|ZH)-\d{4}\.\d{2}" } + | xsd:string { pattern="G(BK|CH|CY|HC)(-\d{4}\.\d{2})?" } + | xsd:string { pattern="GKX-\d{4}\.\d{2,3}" } + | xsd:string { pattern="G(HZ|HZR)-\d{5}\.\d{2}" } + | xsd:string { pattern="G(CE|FC|IDC23|OCD|XHZ)-\d{3}" } + | xsd:string { pattern="G(H|HF|LGYJ|PGLG|T)-\d{4}" } + | xsd:string { pattern="G(CYY|DM|JZ|KJ|XM|ZFY|ZJW|ZYS)-\d{5}" } + | xsd:string { pattern="G(FZ|IDC)-[0-9A-F]{4}" } + | xsd:string { pattern="GGFZ-\d{6}" } + | xsd:string { pattern="G(LK|Z)-\d{7}" } + | xsd:string { pattern="GU-[023][0-9A-F]{4}" } + | xsd:string { pattern="GZA-[123467]\d{5}" } + }? + + code-point-attributes &= attribute kIRG_HSource + { "" | xsd:string { pattern="H-[0-9A-F]{4}" } + | xsd:string { pattern="H(B[012])-[0-9A-F]{4}" } + | xsd:string { pattern="HD-[23]?[0-9A-F]{4}" } + | xsd:string { pattern="HU-[023][0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRG_JSource + { "" | xsd:string { pattern="J[014]-[0-9A-F]{4}" } + | xsd:string { pattern="J3A?-[0-9A-F]{4}" } + | xsd:string { pattern="J13A?-[0-9A-F]{4}" } + | xsd:string { pattern="J14-[0-9A-F]{4}" } + | xsd:string { pattern="JA[34]?-[0-9A-F]{4}" } + | xsd:string { pattern="JARIB-[0-9A-F]{4}" } + | xsd:string { pattern="JH-(JT[ABC][0-9A-F]{3}S?|IB\d{4}|\d{6})" } + | xsd:string { pattern="JK-\d{5}" } + | xsd:string { pattern="JMJ-\d{6}" } + }? + + code-point-attributes &= attribute kIRG_KPSource + { "" | xsd:string { pattern="KP([01]-[0-9A-F]{4}|U-[023][0-9A-F]{4})" } }? + + code-point-attributes &= attribute kIRG_KSource + { "" | xsd:string { pattern="K[0-6]-[0-9A-F]{4}" } + | xsd:string { pattern="KC-\d{5}" } + | xsd:string { pattern="KU-[023][0-9A-F]{4}" } + }? 
+ + code-point-attributes &= attribute kIRG_MSource + { "" | xsd:string { pattern="MA-[0-9A-F]{4}" } + | xsd:string { pattern="MB[12]-[0-9A-F]{4}" } + | xsd:string { pattern="MC-\d{5}" } + | xsd:string { pattern="MDH?-[23]?[0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRG_SSource + { "" | xsd:string { pattern="SAT-\d{5}" } }? + + code-point-attributes &= attribute kIRG_TSource + { "" | xsd:string { pattern="T([1-7A-F]|1[1-3])-[0-9A-F]{4}" } + | xsd:string { pattern="TU-[023][0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRG_UKSource + { "" | xsd:string { pattern="UK-\d{5}" } }? + + code-point-attributes &= attribute kIRG_USource + { "" | xsd:string { pattern="UTC-\d{5}" } }? + + code-point-attributes &= attribute kIRG_VSource + { "" | xsd:string { pattern="V[0-4]-[0-9A-F]{4}" } + | xsd:string { pattern="VN-[023F][0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRGDaeJaweon + { list { xsd:string { pattern="[0-9]{4}\.[0-9]{2}[01]" }+ } }? + + code-point-attributes &= attribute kIRGHanyuDaZidian + { list { xsd:string { pattern="[1-8][0-9]{4}\.[0-3][0-9][01]" }+ } }? + + code-point-attributes &= attribute kIRGKangXi + { list { xsd:string { pattern="[01][0-9]{3}\.[0-7][0-9][01]" }+ } }? + + code-point-attributes &= attribute kJa + { list { xsd:string { pattern="[0-9A-F]{4}S?" }+ } }? + + code-point-attributes &= attribute kJapanese + { list { xsd:string { pattern="[\x{3041}-\x{3096}\x{3099}\x{309A}\x{30A1}-\x{30FA}\x{30FC}]+" }+ } }? + + code-point-attributes &= attribute kJapaneseKun + { list { xsd:string { pattern="[A-Z]+" }+ } }? + + code-point-attributes &= attribute kJapaneseOn + { list { xsd:string { pattern="[A-Z]+" }+ } }? + + code-point-attributes &= attribute kJinmeiyoKanji + { list { xsd:string { pattern="(20[0-9]{2})(:U\+[23]?[0-9A-F]{4})?" }+ } }? + + code-point-attributes &= attribute kJis0 + { list { xsd:string { pattern="[0-9]{4}" }+ } }? 
+ + code-point-attributes &= attribute kJis1 + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kJIS0213 + { list { xsd:string { pattern="[12],[0-9]{2},[0-9]{1,2}" }+ } }? + + code-point-attributes &= attribute kJoyoKanji + { list { xsd:string { pattern="(20[0-9]{2})|(U\+[23]?[0-9A-F]{4})" }+ } }? + + code-point-attributes &= attribute kKangXi + { list { xsd:string { pattern="[0-9]{4}\.[0-9]{2}[01]" }+ } }? + + code-point-attributes &= attribute kKarlgren + { list { xsd:string { pattern="[1-9][0-9]{0,3}[A*]?" }+ } }? + + code-point-attributes &= attribute kKorean + { list { xsd:string { pattern="[A-Z]+" }+ } }? + + code-point-attributes &= attribute kKoreanEducationHanja + { list { xsd:string { pattern="20[0-9]{2}" }+ } }? + + code-point-attributes &= attribute kKoreanName + { list { xsd:string { pattern="20[0-9]{2}" }+ } }? + + code-point-attributes &= attribute kLau + { list { xsd:string { pattern="[1-9][0-9]{0,3}" }+ } }? + + code-point-attributes &= attribute kMainlandTelegraph + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kMandarin + { list { xsd:string { pattern="[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kMatthews + { list { xsd:string { pattern="[1-9][0-9]{0,3}(a|\.5)?" }+ } }? + + code-point-attributes &= attribute kMeyerWempe + { list { xsd:string { pattern="[1-9][0-9]{0,3}[a-t*]?" }+ } }? + + code-point-attributes &= attribute kMojiJoho + { list { xsd:string { pattern="MJ\d{6}(:(FE0[01]|E01[01][0-9A-F]))?" }+ } }? + + code-point-attributes &= attribute kMorohashi + { list { xsd:string { pattern="(\d{5}'{0,2}|H\d{3})(:(FE0[01]|E010[0-9A-F]))?" }+ } }? + + code-point-attributes &= attribute kNelson + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kOtherNumeric + { list { xsd:string { pattern="[0-9]+" }+ } }? 
+ + code-point-attributes &= attribute kPhonetic + { list { xsd:string { pattern="[1-9][0-9]{0,3}[A-D]?\*?" }+ } }? + + code-point-attributes &= attribute kPrimaryNumeric + { list { xsd:string { pattern="[0-9]+" }+ } }? + + code-point-attributes &= attribute kPseudoGB1 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kRSAdobe_Japan1_6 + { list { xsd:string { pattern="[CV]\+[0-9]{1,5}\+[1-9][0-9]{0,2}\.[1-9][0-9]?\.[0-9]{1,2}" }+ } }? + + code-point-attributes &= attribute kRSUnicode + { list { xsd:string { pattern="[1-9][0-9]{0,2}'{0,3}\.-?[0-9]{1,2}" }+ } }? + + code-point-attributes &= attribute kSBGY + { list { xsd:string { pattern="[0-9]{3}\.[0-7][0-9]" }+ } }? + + code-point-attributes &= attribute kSemanticVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}(<[ks][A-Za-z0-9_]+(:[TBZFJ]+)?(,[ks][A-Za-z0-9_]+(:[TBZFJ]+)?)*)?" }+ } }? + + code-point-attributes &= attribute kSimplifiedVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}" }+ } }? + + code-point-attributes &= attribute kSMSZD2003Index + { list { xsd:string { pattern="\d{1,3}\.\d{2}" }+ } }? + + code-point-attributes &= attribute kSMSZD2003Readings + { list { xsd:string { pattern="[a-z\x{300}\x{301}\x{302}\x{304}\x{308}\x{30C}]+(,[a-z\x{300}\x{301}\x{302}\x{304}\x{308}\x{30C}]+)*\x{7CB5}[a-z]+[1-6]([a-z]+[1-6])?(,[a-z]+[1-6]([a-z]+[1-6])?)*" }+ } }? + + code-point-attributes &= attribute kSpecializedSemanticVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}(<[ks][A-Za-z0-9_]+(:[TBZFJ]+)?(,[ks][A-Za-z0-9_]+(:[TBZFJ]+)?)*)?" }+ } }? + + code-point-attributes &= attribute kSpoofingVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}" }+ } }? + + code-point-attributes &= attribute kStrange + { list { ( xsd:string { pattern="[ACU]" } + | xsd:string { pattern="B:U\+31[0-2AB][0-9A-F]" } + | xsd:string { pattern="[FMOR](:U\+[23]?[0-9A-F]{4})?" 
} + | xsd:string { pattern="H:U\+31[3-8][0-9A-F]" } + | xsd:string { pattern="I(:U\+[23]?[0-9A-F]{4})*" } + | xsd:string { pattern="K(:U\+30[A-F][0-9A-F])+" } + | xsd:string { pattern="S:[4-9][0-9]" } + )+}}? + + code-point-attributes &= attribute kTaiwanTelegraph + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kTang + { list { xsd:string { pattern="\*?[A-Za-z()\x{E6}\x{251}\x{259}\x{25B}\x{300}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kTGH + { list { xsd:string { pattern="20[0-9]{2}:[1-9][0-9]{0,3}" }+ } }? + + code-point-attributes &= attribute kTGHZ2013 + { list { xsd:string { pattern="[0-9]{3}\.[0-9]{3}(,[0-9]{3}\.[0-9]{3})*:[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kTotalStrokes + { list { xsd:string { pattern="[1-9][0-9]{0,2}" }+ } }? + + code-point-attributes &= attribute kTraditionalVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}" }+ } }? + + code-point-attributes &= attribute kUnihanCore2020 + { xsd:string { pattern="[GHJKMPT]{1,7}" } }? + + code-point-attributes &= attribute kVietnamese + { list { xsd:string { pattern="[A-Za-z\x{110}\x{111}\x{300}-\x{303}\x{306}\x{309}\x{31B}\x{323}]+" }+ } }? + + code-point-attributes &= attribute kVietnameseNumeric + { list { xsd:string { pattern="\d+" }+ } }? + + code-point-attributes &= attribute kXerox + { list { xsd:string { pattern="[0-9]{3}:[0-9]{3}" }+ } }? + + code-point-attributes &= attribute kXHC1983 + { list { xsd:string { pattern="[0-9]{4}\.[0-9]{3}\*?(,[0-9]{4}\.[0-9]{3}\*?)*:[a-z\x{300}\x{301}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kZhuang + { list { xsd:string { pattern="[a-z]+\*?" }+ } }? + + code-point-attributes &= attribute kZhuangNumeric + { list { xsd:string { pattern="\d+" }+ } }? + + code-point-attributes &= attribute kZVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}(<[ks][A-Za-z0-9_]+(:[TBZ]+)?(,[ks][A-Za-z0-9_]+(:[TBZ]+)?)*)?" 
}+ } }? + +

    +

    + 4.4.24 Tangut data +

    +

    The Tangut data are represented as attributes. The attribute kRSTUnicode + represents the radical stroke index. The attribute kTGT_MergedSrc indicates the + source reference for the character. +

    +

    + + [Tangut data, + 51] + + = + + code-point-attributes &= + attribute kRSTUnicode { xsd:string { pattern="[0-9]+\.[0-9]+" } }? + + code-point-attributes &= + attribute kTGT_MergedSrc + { xsd:string {pattern="L2008-[0-9A-F]{4,5}(-[0-9]{4,5})?"} + | xsd:string {pattern="L2006-[0-9]{4}"} + | xsd:string {pattern="L1997-[0-9]{4}"} + | xsd:string {pattern="L1986-[0-9]{4}"} + | xsd:string {pattern="S1968-[0-9]{4}"} + | xsd:string {pattern="N1966-[0-9]{3}(-[0-9A-Z]{3,4})?"} + | xsd:string {pattern="H2004-[A-Z]-[0-9]{4}"} + | xsd:string {pattern="L2012-[0-9]{4}"} + | xsd:string {pattern="UTN42-[0-9]{3}"} + }? + +

    +

    + 4.4.25 Nushu data +

    +

    The Nushu data are represented as attributes. The attribute kSrc_NushuDuben + indicates the page number and order of the item from the NushuDuben reference source. Nushu common + reading is represented as kReading.

    +

    + + [Nushu data, + 52] + + = + + code-point-attributes &= + attribute kSrc_NushuDuben { xsd:string { pattern="[0-9]+\.[0-9]+" } }? + + code-point-attributes &= + attribute kReading { xsd:string }? + +

    +

    + 4.4.26 Emoji properties +

    +

    The properties Emoji, EPres, EMod, EBase, + EComp, and ExtPict have corresponding attributes: +

    +

    + + [Emoji properties, + 53] + + = + + code-point-attributes &= + attribute Emoji { boolean }? + + code-point-attributes &= + attribute EPres { boolean }? + + code-point-attributes &= + attribute EMod { boolean }? + + code-point-attributes &= + attribute EBase { boolean }? + + code-point-attributes &= + attribute EComp { boolean }? + + code-point-attributes &= + attribute ExtPict { boolean }? + +

    +

    + 5 Blocks +

    +

    The blocks child of the ucd describes the blocks. It has one child + block element per block, with attributes to describe the extent and name of the block. +

    +

    + + [blocks, + 54] + + = + + ucd.content &= + element blocks { + element block { + attribute first-cp { single-code-point }, + attribute last-cp { single-code-point }, + attribute name { text } }+ }? + +

    +

    + 6 Named Sequences +

    +

    The named-sequences child of the ucd describes the named sequences. It has one + child named-sequence element per named sequence, with attributes to describe the name and + sequence. +

    +

    Similarly, the provisional-named-sequences child of the ucd describes the + provisional named sequences. +

    +

    + + [named sequences, + 55] + + = + + ucd.content &= + element named-sequences { + element named-sequence { + attribute cps { one-or-more-code-points }, + attribute name { text } }+ }? + + ucd.content &= + element provisional-named-sequences { + element named-sequence { + attribute cps { one-or-more-code-points }, + attribute name { text } }+ }? + +

    +

    + 7 Normalization Corrections +

    +

    The normalization-corrections child of the ucd describes the normalization + corrections. It has one child normalization-correction element per correction, with + attributes to describe the code point affected, its old normalization, its new normalization and the + version of Unicode in which the correction was made. +

    +

    + + [normalization corrections, + 56] + + = + + ucd.content &= + element normalization-corrections { + element normalization-correction { + attribute cp { single-code-point }, + attribute old { one-or-more-code-points }, + attribute new { one-or-more-code-points }, + attribute version { text } }+ }? + +

    +

    + 8 Standardized Variants +

    +

    The standardized-variants child of the ucd describes the standardized + variants. It has one child element standardized-variant per variant. The attributes on that + last element capture the variation sequence, the description of the desired appearance, and the shaping + environment under which the appearance is different. +

    +

    + + [standardized variants, + 57] + + = + + ucd.content &= + element standardized-variants { + element standardized-variant { + attribute cps { two-code-points }, + attribute desc { text }, + attribute when { text } }+ }? + +

    +

    + 9 CJK Radicals +

    +

    The cjk-radicals child of the ucd describes the CJK radicals. It has one + child element cjk-radical per radical. The attributes on that last element capture the + radical number, the corresponding CJK radical character, and the corresponding CJK unified ideograph. +

    +

    + + [cjk radicals, + 58] + + = + + ucd.content &= + element cjk-radicals { + element cjk-radical { + attribute number { xsd:string {pattern="[0-9]{1,3}'{0,3}"}}, + attribute radical { single-code-point? }, + attribute ideograph { single-code-point } }+ }? + +

    +

    + 10 Emoji sources +

    +

    The emoji-sources child of the ucd describes the emoji sources. +

    +

    + + [emoji sources, + 59] + + = + + ucd.content &= + element emoji-sources { + element emoji-source { + attribute unicode { one-or-more-code-points }, + attribute docomo { jis-code-point? }, + attribute kddi { jis-code-point? }, + attribute softbank { jis-code-point? } }+ }? + +

    +

    + + [datatype for code points, + 60] + + = + + jis-code-point = xsd:string { pattern = "[0-9A-F]{4}" } + +

    +

    + 11 Do Not Emit +

    +

    + The do-not-emit child of the ucd describes the + character sequences that should not be emitted or generated in newly authored texts. + +

    +

    + + [do-not-emit, + 61] + + = + + ucd.content &= + element do-not-emit { + element instead { + attribute of { one-or-more-code-points }, + attribute use { one-or-more-code-points }, + attribute because { "Bengali_Khanda_Ta" + | "Deprecated" + | "Discouraged" + | "Dotless_Form" + | "Hamza_Form" + | "Indic_Atomic_Consonant" + | "Indic_Consonant_Conjunct" + | "Indic_Vowel_Letter" + | "Malayalam_Chillu" + | "Precomposed_Form" + | "Precomposed_Hieroglyph" + | "Preferred_Spelling" + | "Tamil_Shrii" + } }+ }? + +

    +

    + 12 The full schema +

    +

    Our schema is just the accumulation of the pieces we have described so far: +

    +

    + + [UCD RelaxNG schema] + + = + + + [namespace declaration: 1] + + + [datatypes: 2, 3, 60] + + + [schema start: 4] + + + [boolean: 5] + + + [description: 6] + + + [repertoire: 7, 8, 9, 10] + + + [attributes: 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50] + + + [Tangut data: 51] + + + [Nushu data: 52] + + + [blocks: 54] + + + [named sequences: 55] + + + [normalization corrections: 56] + + + [standardized variants: 57] + + + [cjk radicals: 58] + + + [emoji sources: 59] + + + [Emoji properties: 53] + + + [do-not-emit: 61] + + +

    +

    An expanded version is linked from the top of this document.

    +

    + 13 Examples +

    +

    Here is a fragment of the UCD for a few representative + characters (only some of the properties are represented): +

    +
    +            
    +  <ucd xmlns="http://www.unicode.org/ns/2003/ucd/1.0">
    +    <repertoire>
    +      <char cp="001F" age="1.1" na="&lt;control&gt;" na1="UNIT SEPARATOR"
    +            gc="Cc" bc="S" lb="CM"/>
    +
    +      <char cp="0020" age="1.1" na="SPACE" gc="Zs" bc="WS" ea="Na" lb="SP"/>
    +
    +      <char cp="0026" age="1.1" na="AMPERSAND" gc="Po" bc="ON" ea="Na"/>
    +
    +      <char cp="0028" age="1.1" na="LEFT PARENTHESIS" na1="OPENING PARENTHESIS"
    +            gc="Ps" bc="ON" Bidi_M="y" bmg="0029" ea="Na" lb="OP"/>
    +
    +      <char cp="0041" age="1.1" na="LATIN CAPITAL LETTER A"
    +            gc="Lu" slc="0061" ea="Na" sc="Latn"/>
    +
    +      <char cp="AC00" age="2.0" na="HANGUL SYLLABLE GA" gc="Lo"
    +            dt="can" dm="1100 1161" ea="W" lb="ID" sc="Hang"/>
    +
    +      <char cp="20094" age="3.1" na="CJK UNIFIED IDEOGRAPH-20094"
    +            gc="Lo" ea="W" lb="ID" sc="Hani" kIRG_GSource="KX"
    +            kIRGHanyuDaZidian="10036.060" kIRG_TSource="5-214E"
    +           kRSUnicode="4.3" kIRGKangXi="0082.090"/>
    +
    +      <group age="3.2" gc="Lo" sc="Buhd">
    +        <char cp="1740" na="BUHID LETTER A"/>
    +        <char cp="1741" na="BUHID LETTER I"/>
    +        <char cp="1752" na="BUHID VOWEL SIGN I" gc="Mn"/>
    +        <char cp="1820" age="3.0" na="MONGOLIAN LETTER A" sc="Mong"/>
    +      </group>
    +    </repertoire>
    +  </ucd>
    +
    +
    +

    + Acknowledgments +

    +

    Thanks to Markus Scherer and Mark Davis for their help developing this XML representation. Thanks to + the reviewers: Julie Allen, Ernest van den Boogaard, Daniel Bünzli, John Cowan, Asmus Freytag, + Felix Sasaki, Andrew West. Special thanks to Eric Muller and Laurențiu Iancu. +

    +

    + Modifications +

    +

    This section indicates the changes introduced by each revision.

    +
    +

    + Revision 36 +

    +
      +
    • New value for the age attribute: 16.0. +
    • +
    • New values for the blk attribute: Egyptian_Hieroglyphs_Ext_A, + Garay, Gurung_Khema, Kirat_Rai, Myanmar_Ext_C, + Ol_Onal, Sunuwar, Symbols_For_Legacy_Computing_Sup, + Todhri, Tulu_Tigalari. +
    • +
    • New values for the script attribute: Gara, Gukh, + Krai, Onao, Sunu, Todr, Tutg. +
    • +
    • New value for the jg attribute: Kashmiri_Yeh.
    • +
    • New value for the InSC attribute: Reordering_Killer. +
    • +
    • New attributes: MCM, kFanqie, kZhuang. +
    • +
    • Modified patterns for the cjk-radical/@number, kRSUnicode and + kIRG_GSource + attributes. +
    • +
    • Added the do-not-emit element. +
    • +
    +
    +
    +

    Revision 35 being a proposed update, only changes between revisions 34 and 36 are + noted here. +

    +
    +
    +

    + Revision 34 +

    +
      +
    • New value for the age attribute: 15.1. +
    • +
    • New value for the blk attribute: CJK_Ext_I. +
    • +
    • New values for the lb attribute: AK, AP, + AS, VF, VI. +
    • +
    • Modified values for the number, radical attributes of the + cjk-radical + element. +
    • +
    • Changed single value into list for the nv code point attribute. +
    • +
    • New code point attributes: ID_Compat_Math_Continue, + ID_Compat_Math_Start, IDSU, NFKC_SCF, InCB. +
    • +
    • Modified patterns for the kBigFive, kIRG_GSource, + kMorohashi, kRSUnicode attributes. +
    • +
    • Changed single values into lists for the kMorohashi, kPrimaryNumeric + Unihan attributes. +
    • +
    • New Unihan attributes: kJapanese, kMojiJoho, + kSMSZD2003Index, kSMSZD2003Readings, kVietnameseNumeric, + kZhuangNumeric. +
    • +
    +
    +
    +

    Revision 33 being a proposed update, only changes between revisions 32 and 34 are + noted here. +

    +
    +
    +

    + Revision 32 +

    +
      +
    • New value for the age attribute: 15.0. +
    • +
    • New values for the blk attribute: Arabic_Ext_C, CJK_Ext_H, + Cyrillic_Ext_D, Devanagari_Ext_A, Kaktovik_Numerals, Kawi, + Nag_Mundari. +
    • +
    • New values for the script attribute: Kawi, Nagm. +
    • +
    • New Unihan attribute: kAlternateTotalStrokes. +
    • +
    • Modified patterns for the kIRG_GSource, kIRG_HSource, + kIRG_TSource, kSemanticVariant, kSpecializedSemanticVariant, + kZVariant + attributes. +
    • +
    +
    +
    +

    Revision 31 being a proposed update, only changes between revisions 30 and 32 are + noted here. +

    +
    +
    +

    + Revision 30 +

    +
      +
    • New value for the age attribute: 14.0. +
    • +
    • New values for the blk attribute: Arabic_Ext_B, + Cypro_Minoan, Ethiopic_Ext_B, Kana_Ext_B, + Latin_Ext_F, Latin_Ext_G, Old_Uyghur, Tangsa, + Toto, UCAS_Ext_A, Vithkuqi, Znamenny_Music. +
    • +
    • New values for the script attribute: Cpmn, Ougr, + Tnsa, Toto, Vith. +
    • +
    • New values for the jg attribute: Thin_Yeh, Vertical_Tail. +
    • +
    • New Unihan attribute: kStrange. +
    • +
    • Modified patterns for the kIRG_GSource, kIRG_MSource, + kIRG_VSource, kPhonetic, kSpoofingVariant attributes. +
    • +
    • Removal of the kWubi attribute, which has never been present in + released versions of the UCD. +
    • +
    +
    +
    +

    Revision 29 being a proposed update, only changes between revisions 28 and 30 are + noted here. +

    +
    +
    +

    + Revision 28 +

    +
      +
    • New value for the age attribute: 13.0. +
    • +
    • New values for the blk attribute: Chorasmian, CJK_Ext_G, + Dives_Akuru, Khitan_Small_Script, Lisu_Sup, + Symbols_For_Legacy_Computing, Tangut_Sup, Yezidi. +
    • +
    • New values for the script attribute: Chrs, Diak, + Kits, Yezi. +
    • +
    • New value for the InPC attribute: Top_And_Bottom_And_Left. +
    • +
    • New Unihan attributes kSpoofingVariant, kUnihanCore2020, + kIRG_SSource, kIRG_UKSource, kTGHZ2013. +
    • +
    • New Emoji attributes Emoji, EPres, EMod, + EBase, EComp, ExtPict. +
    • +
    • Modified patterns for the kIRG_GSource, kIRG_HSource, + kIRG_KPSource, kIRG_KSource, kIRG_TSource, kKangXi, + kSemanticVariant, kSimplifiedVariant, + kSpecializedSemanticVariant, kTraditionalVariant attributes. +
    • +
    +
    +
    +

    Revision 27 being a proposed update, only changes between revisions 26 and 28 are + noted here. +

    +
    +
    +

    + Revision 26 +

    +
      +
    • New value for the age attribute: 12.1. +
    • +
    +
    +
    +

    + Revision 25 +

    +
      +
    • New value for the age attribute: 12.0. +
    • +
    • New values for the script attribute: Elym, Hmnp, + Nand, Wcho. +
    • +
    • New values for the blk attribute: + Egyptian_Hieroglyph_Format_Controls, Elymaic, Nandinagari, + Nyiakeng_Puachue_Hmong, Ottoman_Siyaq_Numbers, Small_Kana_Ext, + Symbols_And_Pictographs_Ext_A, Tamil_Sup, Wancho. +
    • +
    • Modified patterns for the kIRG_GSource, kIRG_KSource, + kIRG_TSource, kTaiwanTelegraph attributes. +
    • +
    +
    +
    +

    Revision 24 being a proposed update, only changes between revisions 23 and 25 are + noted here. +

    +
    +
    +

    + Revision 23 +

    +
      +
    • New value for the age attribute: 11.0. +
    • +
    • New values for the blk attribute: Chess_Symbols, + Dogra, Georgian_Ext, Gunjala_Gondi, + Hanifi_Rohingya, Indic_Siyaq_Numbers, Makasar, + Mayan_Numerals, Medefaidrin, Old_Sogdian, Sogdian. +
    • +
    • New values for the script attribute: Dogr, Gong, + Maka, Medf, Rohg, Sogd, Sogo. +
    • +
    • New values for the jg attribute: Hanifi_Rohingya_Kinna_Ya, + Hanifi_Rohingya_Pa. +
    • +
    • New value for the wb attribute: WSegSpace. +
    • +
    • New value for the InSC attribute: Consonant_Initial_Postfixed. +
    • +
    • New attributes: EqUIdeo, kJinmeiyoKanji, kJoyoKanji, + kKoreanEducationHanja, kKoreanName, kTGH. +
    • +
    • Modified patterns for the kTGT_MergedSrc attribute. +
    • +
    • Modified patterns for the kIRG_GSource, kIRG_HSource and + kIRG_VSource + attributes. +
    • +
    +
    +
    +

    Revision 22 being a proposed update, only changes between revisions 21 and 23 are + noted here. +

    +
    +
    +

    + Revision 21 +

    +
      +
    • New value for the age attribute: 10.0. +
    • +
    • New values for the blk attribute: CJK_Ext_F, Kana_Ext_A, + Masaram_Gondi, Nushu, Soyombo, Syriac_Sup, + Zanabazar_Square. +
    • +
    • New values for the sc attribute: Gonm, Nshu, + Soyo, Zanb. +
    • +
    • New values for the jg attribute: Malayalam_Nga, + Malayalam_Ja, Malayalam_Nya, Malayalam_Tta, Malayalam_Nna, + Malayalam_Nnna, Malayalam_Bha, Malayalam_Ra, + Malayalam_Lla, Malayalam_Llla, Malayalam_Ssa. +
    • +
    • New value for the InPC attribute: Bottom_And_Left. +
    • +
    • Modified patterns for the kIRG_GSource, kIRG_JSource, + kIRG_KSource + attributes. +
    • +
    • New code point attributes: vo, + RI +
    • +
    • New code point attributes for Nushu data: kSrc_NushuDuben and + kReading. +
    • +
    +
    +
    +

    Revision 20 being a proposed update, only changes between revisions 19 and 21 are + noted here. +

    +
    +
    +

    + Revision 19 +

    +
      +
    • New value for the age attribute: 9.0. +
    • +
    • New values for the sc attribute: Adlm, Bhks, + Marc, Newa, Osge, Tang. +
    • +
    • New values for the blk attribute: Adlam, Bhaiksuki, + Cyrillic_Ext_C, Glagolitic_Sup, Ideographic_Symbols, + Marchen, Mongolian_Sup, Newa, Osage, + Tangut, Tangut_Components. +
    • +
    • New values for the gcb attribute: EB, EBG, EM, + GAZ, ZWJ. +
    • +
    • New values for the wb attribute: EB, EBG, EM, + GAZ, ZWJ. +
    • +
    • New values for the lb attribute: EB, EM, ZWJ. +
    • +
    • New values for the jg attribute: African_Feh, + African_Noon, African_Qaf. +
    • +
    • New code point attributes: PCM, kRSTUnicode and + kTGT_MergedSrc. +
    • +
    • Modified patterns for the kRSUnicode, kRSKangXi, + kMandarin, kIRG_JSource, kIRG_USource and kFennIndex + attributes. +
    • +
    +
    +
    +

    Revision 18 being a proposed update, only changes between revisions 17 and 19 are + noted here. +

    +
    +
    +

    + Revision 17 +

    +
      +
    • New value for the age attribute: 8.0. +
    • +
    • New values for the sc attribute: Ahom, Hatr, + Hluw, Hung, Mult, Sgnw. +
    • +
    • New values for the blk attribute: Ahom, + Anatolian_Hieroglyphs, Cherokee_Sup, CJK_Ext_E, + Early_Dynastic_Cuneiform, Hatran, Multani, Old_Hungarian, + Sup_Symbols_And_Pictographs, Sutton_SignWriting. +
    • +
    • New values for the InSC attribute: Consonant_Killer, + Consonant_Prefixed, Consonant_With_Stacker, Syllable_Modifier. +
    • +
    • New code point attributes: InPC, kJa. +
    • +
    • New patterns for the kIRG_GSource attribute: GFC-, GGFZ-. +
    • +
    • Switched the reference to ISO 19757 from :2003 and :2003 Amd1 to :2008.
    • +
    +
    +
    +

    Revision 16 being a proposed update, only changes between revisions 15 and 17 are + noted here. +

    +
    +
    +

    + Revision 15 +

    +
      +
    • New value for the age attribute: 7.0. +
    • +
    • New values for the jg attribute. +
    • +
    • New values for the sc attribute. +
    • +
    • New values for the blk attribute. +
    • +
    • New values for the InSC attribute. +
    • +
    • New values for the kIICore attribute. +
    • +
    • New values for the kIRG_GSource attribute. +
    • +
    +
    +
    +

    Revision 14 being a proposed update, only changes between revisions 13 and 15 are + noted here. +

    +
    +
    +

    + Revision 13 +

    +
      +
    • New value for the age attribute: 6.3. +
    • +
    • New values DQ, HL, SQ for the WB attribute (for Unicode 6.3). +
    • +
    • New code point attributes bpt and bpb (for Unicode 6.3). +
    • +
    • New values for the bc attribute: LRI, RLI, FSI, + PDI + (for Unicode 6.3). +
    • +
    • Updated the patterns for kHanyuPinlu and kTotalStrokes (for + Unicode 6.3). +
    • +
    • Updated the patterns for kIRG_HSource and kIRG_HSource (for + Unicode 6.2). +
    • +
    • Clarified that the child elements of list-like elements are in no particular order.
    • +
    +
    +
    +

    Revision 12 being a proposed update, only changes between revisions 11 and 13 are + noted here. +

    +
    +
    +

    + Revision 11 +

    +
      +
    • New value for the age attribute: 6.2. +
    • +
    • New value for the gcb, wb and lb attributes: + RI + (for Unicode 6.2). +
    • +
    • Updated the patterns for kIRG_GSource and kIRG_HSource (for + Unicode 6.2). +
    • +
    +
    +
    +

    Revision 10 being a proposed update, only changes between revisions 9 and 11 are + noted here. +

    +
    +
    +

    + Revision 9 +

    +
      +
    • Clarified the default values.
    • +
    • Indicate that property values may change from one release to the next.
    • +
    • Introduced the blk attribute, for the Block property. +
    • +
    • Introduced the scx attribute, for the ScriptExtensions property. +
    • +
    • Introduced the name-alias element, for the Name_Alias property. +
    • +
    • New value for the age attribute: 6.1. +
    • +
    • New values for the script attribute: Cakm, Merc, + Mero, Plrd, Shrd, Sora, Takr. +
    • +
    • New values for the lb attribute: HL and CJ. +
    • +
    • New value for the jg attribute: Rohingya_Yeh. +
    • +
    • The value of the fc_nfkc attribute must now be either # or + one-or-more-code-points. +
    • +
    • For the nv attribute, the absence of a numeric value is now represented by + NaN + rather than by the empty string. +
    • +
    • The values of the ccc are now restricted to 0..254, instead of 0..255. +
    • +
    • Updated the patterns for kSemanticVariant, + kSpecializedSemanticVariant, kIRG_USource, and kMandarin. +
    • +
    +
    +
    +

    Revision 8 being a proposed update, only changes between revisions 7 and 9 are noted + here. +

    +
    +
    +

    + Revision 7 +

    +
      +
    • New value for the age attribute: 6.0. +
    • +
    • New value for the jg attribute: + Teh_Marbuta_Goal +
    • +
    • New values for the script attribute: Batk, Brah, + Mand. +
    • +
    • Updated the patterns for kIRG_GSource, kIRG_HSource, + kIRG_JSource, kIRG_KSource, kIRG_MSource, + kIRG_TSource, kIRG_VSource. +
    • +
    • Added the InSC and InMC elements. +
    • +
    • Added the emoji-sources element. +
    • +
    +
    +
    +

    Revision 6 being a proposed update, only changes between revisions 5 and 7 are noted + here. +

    +
    +
    +

    + Revision 5 +

    +
      +
    • Changed the type of block/@first-cp, block/@last-cp and + normalization-corrections/@cp + from text to + single-code-point +
    • +
    • Changed the type of named-sequence/@cps, + provisional-named-sequences/@cps, normalization-correction/@old and + normalization-correction/@new + from text to one-or-more-code-points. +
    • +
    • Changed the type of standardized-variants/@cps from text to + two-code-points. +
    • +
    • New values for the jg attribute: Farsi_Yeh and Nya. +
    • +
    • New value for the age attribute: 5.2. +
    • +
    • New values for the sc attribute: Lana, Tavt, + Avst, Egyp, Samr, Lisu, Bamu, Java, + Mtei, Armi, Sarb, Prti, Phli, Orkh, + Kthi. +
    • +
    • New value for the lb attribute: CP. +
    • +
    • New value for the sc attribute: Zinh. +
    • +
    • New code point attributes CI, Cased, CWCF, + CWCM, CWL, CWKCF, CWT, CWU, + NFKC_CF. +
    • +
    • New attributes kHanyuPinyin and kIRG_MSource. +
    • +
    • New element + cjk-radicals +
    • +
    • Updated the patterns for kIRG_GSource, kIRG_JSource, + kIRG_KPSource, kIRG_KSource, kIRG_TSource, + kIRG_VSource, kHanyuPinlu, kMandarin, + kSemanticVariant, kSpecializedSemanticVariant, + kVietnamese, kZVariant. +
    • +
    • Point out that Relax NG schemas do not modify or augment the infoset, and that it is possible + to convert mechanically our schema to other schema languages. +
    • +
    +
    +
    +

    Revision 4 being a proposed update, only changes between revisions 3 and 5 are noted + here. +

    +
    +
    +

    + Revision 3 +

    +
      +
    • First approved version, for Unicode 5.1.0.
    • +
    • For optional elements which act as collections, such as repertoire and + named-sequences, impose that there be at least one element in the collection. +
    • +
    • Remove the constraint that the value jg is limited when jt has + certain values; similarly for bmg / Bidi_M and for nv / + nt. +
    • +
    • Value NL added to the WB attribute (for Unicode 5.1). +
    • +
    • Value PP added to the GCB attribute (for Unicode 5.1). +
    • +
    • Corrected the Vai script value to Vaii. +
    • +
    • Removed the discussion of elements or attributes in different namespace.
    • +
    • Removed the code-point element. +
    • +
    +
    +
    +

    + Revision 2 +

    +
      +
    • Promoted to Draft UAX.
    • +
    • Changed the title from "An XML representation of the UCD"
    • +
    • Value 5.1 added to the age attribute (for Unicode 5.1). +
    • +
    • Value SM added to the gcb attribute (for Unicode 5.1). +
    • +
    • Values CR, Extend, LF, MB added to the + WB + attribute (for Unicode 5.1). +
    • +
    • Values CR, EX, LF, SC added to the SB + attribute (for Unicode 5.1). +
    • +
    • Value Burushaski_Yeh_Barree added to the jg attribute (for + Unicode 5.1). +
    • +
    • Value Alef_Maqsurah added to the jg attribute (for Unicode 2.x). +
    • +
    • Values Cari, Cham, Kali, Lepc, + Lyci, Lydi, Olck, Rjng, Saur, Sund and + Vai + added to the sc attribute (for Unicode 5.0). +
    • +
    • + jamo + attribute renamed to + JSN +
    • +
    • + sfc + attribute renamed to + scf +
    • +
    • Attribute kXHC1983 added (for Unicode 5.1.0). +
    • +
    • Pattern for attribute kIRG_USource extended (for Unicode 5.1.0). +
    • +
    • Element provisional-named-sequences added (for Unicode 5.0) +
    • +
    +
    +
    +

    + Revision 1 +

    +
      +
    • First working draft.
    • +
    +
    +
    + + + +
    + + diff --git a/unicodetools/src/main/resources/org/unicode/uax42/output/index.rnc b/unicodetools/src/main/resources/org/unicode/uax42/output/index.rnc new file mode 100644 index 0000000000..84d9b5875c --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/output/index.rnc @@ -0,0 +1,1455 @@ + + # Copyright © 2024 Unicode, Inc. + + + + default namespace ucd = "http://www.unicode.org/ns/2003/ucd/1.0" + + + # default; datatypes xsd = "http://www.w3.org/2001/XMLSchema-datatypes" + + single-code-point = xsd:string { pattern = "(|[1-9A-F]|(10))[0-9A-F]{4}" } + + one-or-more-code-points = list { single-code-point + } + zero-or-more-code-points = list { single-code-point * } + two-code-points = list { single-code-point, single-code-point } + + jis-code-point = xsd:string { pattern = "[0-9A-F]{4}" } + + + start = + element ucd { ucd.content } + + + boolean = "Y" | "N" + + + ucd.content &= + element description { text }? + + + ucd.content &= + element repertoire { (code-point | group) + }? + + set-of-code-points = + attribute cp { single-code-point } + | ( attribute first-cp { single-code-point }, + attribute last-cp { single-code-point } ) + + code-point |= + element reserved { + set-of-code-points, + code-point-attributes } + + code-point |= + element noncharacter { + set-of-code-points, + code-point-attributes } + + code-point |= + element surrogate { + set-of-code-points, + code-point-attributes } + + code-point |= + element char { + set-of-code-points, + code-point-attributes } + + group = + element group { + code-point-attributes, + code-point* } + + + code-point-attributes &= + attribute age { "1.1" + | "2.0" | "2.1" + | "3.0" | "3.1" | "3.2" + | "4.0" | "4.1" + | "5.0" | "5.1" | "5.2" + | "6.0" | "6.1" | "6.2" | "6.3" + | "7.0" + | "8.0" + | "9.0" + | "10.0" + | "11.0" + | "12.0" | "12.1" + | "13.0" + | "14.0" + | "15.0" | "15.1" + | "16.0" + | "17.0" + | "unassigned" + }? 
+ + code-point-attributes &= + attribute na { "" | + "CJK UNIFIED IDEOGRAPH-#" | + "CJK COMPATIBILITY IDEOGRAPH-#" | + "EGYPTIAN HIEROGLYPH-#" | + "TANGUT IDEOGRAPH-#" | + "KHITAN SMALL SCRIPT CHARACTER-#" | + "NUSHU CHARACTER-#" | + xsd:string { pattern="[a-zA-Z0-9]+(( -|- |[\-_ ])[a-zA-Z0-9]+)*" } + }? + + code-point-attributes &= + attribute na1 { "" | xsd:string { pattern="[a-zA-Z0-9]+([\-_ ][a-zA-Z0-9]+)*( \(.*\))?" } }? + + code-point-attributes &= + element name-alias { + attribute alias { xsd:string { pattern="[a-zA-Z0-9]+(( -|- |[\-_ ])[a-zA-Z0-9]+)*" } }?, + attribute type { "abbreviation" | "alternate" + | "control" | "correction" + | "figment" + }? } * + + code-point-attributes &= + attribute blk { "Adlam" + | "Aegean_Numbers" + | "Ahom" + | "Alchemical" + | "Alphabetic_PF" + | "Anatolian_Hieroglyphs" + | "Ancient_Greek_Music" + | "Ancient_Greek_Numbers" + | "Ancient_Symbols" + | "Arabic" + | "Arabic_Ext_A" + | "Arabic_Ext_B" + | "Arabic_Ext_C" + | "Arabic_Math" + | "Arabic_PF_A" + | "Arabic_PF_B" + | "Arabic_Sup" + | "Armenian" + | "Arrows" + | "ASCII" + | "Avestan" + | "Balinese" + | "Bamum" + | "Bamum_Sup" + | "Bassa_Vah" + | "Batak" + | "Bengali" + | "Bhaiksuki" + | "Block_Elements" + | "Bopomofo" + | "Bopomofo_Ext" + | "Box_Drawing" + | "Brahmi" + | "Braille" + | "Buginese" + | "Buhid" + | "Byzantine_Music" + | "Carian" + | "Caucasian_Albanian" + | "Chakma" + | "Cham" + | "Cherokee" + | "Cherokee_Sup" + | "Chess_Symbols" + | "Chorasmian" + | "CJK" + | "CJK_Compat" + | "CJK_Compat_Forms" + | "CJK_Compat_Ideographs" + | "CJK_Compat_Ideographs_Sup" + | "CJK_Ext_A" + | "CJK_Ext_B" + | "CJK_Ext_C" + | "CJK_Ext_D" + | "CJK_Ext_E" + | "CJK_Ext_F" + | "CJK_Ext_G" + | "CJK_Ext_H" + | "CJK_Ext_I" + | "CJK_Radicals_Sup" + | "CJK_Strokes" + | "CJK_Symbols" + | "Compat_Jamo" + | "Control_Pictures" + | "Coptic" + | "Coptic_Epact_Numbers" + | "Counting_Rod" + | "Cuneiform" + | "Cuneiform_Numbers" + | "Currency_Symbols" + | "Cypriot_Syllabary" + | "Cypro_Minoan" + 
| "Cyrillic" + | "Cyrillic_Ext_A" + | "Cyrillic_Ext_B" + | "Cyrillic_Ext_C" + | "Cyrillic_Ext_D" + | "Cyrillic_Sup" + | "Deseret" + | "Devanagari" + | "Devanagari_Ext" + | "Devanagari_Ext_A" + | "Diacriticals" + | "Diacriticals_Ext" + | "Diacriticals_For_Symbols" + | "Diacriticals_Sup" + | "Dingbats" + | "Dives_Akuru" + | "Dogra" + | "Domino" + | "Duployan" + | "Early_Dynastic_Cuneiform" + | "Egyptian_Hieroglyph_Format_Controls" + | "Egyptian_Hieroglyphs" + | "Egyptian_Hieroglyphs_Ext_A" + | "Elbasan" + | "Elymaic" + | "Emoticons" + | "Enclosed_Alphanum" + | "Enclosed_Alphanum_Sup" + | "Enclosed_CJK" + | "Enclosed_Ideographic_Sup" + | "Ethiopic" + | "Ethiopic_Ext" + | "Ethiopic_Ext_A" + | "Ethiopic_Ext_B" + | "Ethiopic_Sup" + | "Garay" + | "Geometric_Shapes" + | "Geometric_Shapes_Ext" + | "Georgian" + | "Georgian_Ext" + | "Georgian_Sup" + | "Glagolitic" + | "Glagolitic_Sup" + | "Gothic" + | "Grantha" + | "Greek" + | "Greek_Ext" + | "Gujarati" + | "Gunjala_Gondi" + | "Gurmukhi" + | "Gurung_Khema" + | "Half_And_Full_Forms" + | "Half_Marks" + | "Hangul" + | "Hanifi_Rohingya" + | "Hanunoo" + | "Hatran" + | "Hebrew" + | "High_PU_Surrogates" + | "High_Surrogates" + | "Hiragana" + | "IDC" + | "Ideographic_Symbols" + | "Imperial_Aramaic" + | "Indic_Number_Forms" + | "Indic_Siyaq_Numbers" + | "Inscriptional_Pahlavi" + | "Inscriptional_Parthian" + | "IPA_Ext" + | "Jamo" + | "Jamo_Ext_A" + | "Jamo_Ext_B" + | "Javanese" + | "Kaithi" + | "Kaktovik_Numerals" + | "Kana_Ext_A" + | "Kana_Ext_B" + | "Kana_Sup" + | "Kanbun" + | "Kangxi" + | "Kannada" + | "Katakana" + | "Katakana_Ext" + | "Kawi" + | "Kayah_Li" + | "Kharoshthi" + | "Khitan_Small_Script" + | "Khmer" + | "Khmer_Symbols" + | "Khojki" + | "Khudawadi" + | "Kirat_Rai" + | "Lao" + | "Latin_1_Sup" + | "Latin_Ext_A" + | "Latin_Ext_Additional" + | "Latin_Ext_B" + | "Latin_Ext_C" + | "Latin_Ext_D" + | "Latin_Ext_E" + | "Latin_Ext_F" + | "Latin_Ext_G" + | "Lepcha" + | "Letterlike_Symbols" + | "Limbu" + | "Linear_A" + | 
"Linear_B_Ideograms" + | "Linear_B_Syllabary" + | "Lisu" + | "Lisu_Sup" + | "Low_Surrogates" + | "Lycian" + | "Lydian" + | "Mahajani" + | "Mahjong" + | "Makasar" + | "Malayalam" + | "Mandaic" + | "Manichaean" + | "Marchen" + | "Masaram_Gondi" + | "Math_Alphanum" + | "Math_Operators" + | "Mayan_Numerals" + | "Medefaidrin" + | "Meetei_Mayek" + | "Meetei_Mayek_Ext" + | "Mende_Kikakui" + | "Meroitic_Cursive" + | "Meroitic_Hieroglyphs" + | "Miao" + | "Misc_Arrows" + | "Misc_Math_Symbols_A" + | "Misc_Math_Symbols_B" + | "Misc_Pictographs" + | "Misc_Symbols" + | "Misc_Technical" + | "Modi" + | "Modifier_Letters" + | "Modifier_Tone_Letters" + | "Mongolian" + | "Mongolian_Sup" + | "Mro" + | "Multani" + | "Music" + | "Myanmar" + | "Myanmar_Ext_A" + | "Myanmar_Ext_B" + | "Myanmar_Ext_C" + | "Nabataean" + | "Nag_Mundari" + | "Nandinagari" + | "NB" + | "New_Tai_Lue" + | "Newa" + | "NKo" + | "Number_Forms" + | "Nushu" + | "Nyiakeng_Puachue_Hmong" + | "OCR" + | "Ogham" + | "Ol_Chiki" + | "Ol_Onal" + | "Old_Hungarian" + | "Old_Italic" + | "Old_North_Arabian" + | "Old_Permic" + | "Old_Persian" + | "Old_Sogdian" + | "Old_South_Arabian" + | "Old_Turkic" + | "Old_Uyghur" + | "Oriya" + | "Ornamental_Dingbats" + | "Osage" + | "Osmanya" + | "Ottoman_Siyaq_Numbers" + | "Pahawh_Hmong" + | "Palmyrene" + | "Pau_Cin_Hau" + | "Phags_Pa" + | "Phaistos" + | "Phoenician" + | "Phonetic_Ext" + | "Phonetic_Ext_Sup" + | "Playing_Cards" + | "Psalter_Pahlavi" + | "PUA" + | "Punctuation" + | "Rejang" + | "Rumi" + | "Runic" + | "Samaritan" + | "Saurashtra" + | "Sharada" + | "Shavian" + | "Shorthand_Format_Controls" + | "Siddham" + | "Sinhala" + | "Sinhala_Archaic_Numbers" + | "Small_Forms" + | "Small_Kana_Ext" + | "Sogdian" + | "Sora_Sompeng" + | "Soyombo" + | "Specials" + | "Sundanese" + | "Sundanese_Sup" + | "Sunuwar" + | "Sup_Arrows_A" + | "Sup_Arrows_B" + | "Sup_Arrows_C" + | "Sup_Math_Operators" + | "Sup_PUA_A" + | "Sup_PUA_B" + | "Sup_Punctuation" + | "Sup_Symbols_And_Pictographs" + | 
"Super_And_Sub" + | "Sutton_SignWriting" + | "Syloti_Nagri" + | "Symbols_And_Pictographs_Ext_A" + | "Symbols_For_Legacy_Computing" + | "Symbols_For_Legacy_Computing_Sup" + | "Syriac" + | "Syriac_Sup" + | "Tagalog" + | "Tagbanwa" + | "Tags" + | "Tai_Le" + | "Tai_Tham" + | "Tai_Viet" + | "Tai_Xuan_Jing" + | "Takri" + | "Tamil" + | "Tamil_Sup" + | "Tangsa" + | "Tangut" + | "Tangut_Components" + | "Tangut_Sup" + | "Telugu" + | "Thaana" + | "Thai" + | "Tibetan" + | "Tifinagh" + | "Tirhuta" + | "Todhri" + | "Toto" + | "Transport_And_Map" + | "Tulu_Tigalari" + | "UCAS" + | "UCAS_Ext" + | "UCAS_Ext_A" + | "Ugaritic" + | "Vai" + | "Vedic_Ext" + | "Vertical_Forms" + | "Vithkuqi" + | "VS" + | "VS_Sup" + | "Wancho" + | "Warang_Citi" + | "Yezidi" + | "Yi_Radicals" + | "Yi_Syllables" + | "Yijing" + | "Zanabazar_Square" + | "Znamenny_Music" + }? + + code-point-attributes &= + attribute gc { "Cc" | "Cf" | "Cn" | "Co" | "Cs" + | "Ll" | "Lm" | "Lo" | "Lt" | "Lu" + | "Mc" | "Me" | "Mn" + | "Nd" | "Nl" | "No" + | "Pc" | "Pd" | "Pe" | "Pf" | "Pi" | "Po" | "Ps" + | "Sc" | "Sk" | "Sm" | "So" + | "Zl" | "Zp" | "Zs" + }? + + code-point-attributes &= + attribute ccc { xsd:integer { minInclusive="0" maxInclusive="254" } }? + + code-point-attributes &= + attribute bc { "AL" | "AN" + | "B" | "BN" + | "CS" + | "EN" | "ES" | "ET" + | "FSI" + | "L" | "LRE" | "LRI" | "LRO" + | "NSM" + | "ON" + | "PDF" | "PDI" + | "R" | "RLE" | "RLI" | "RLO" + | "S" + | "WS" + }? + + code-point-attributes &= + attribute Bidi_M { boolean }? + + code-point-attributes &= + attribute bmg { "" | single-code-point }? + + code-point-attributes &= + attribute Bidi_C { boolean }? + + code-point-attributes &= + attribute bpt { "o" | "c" | "n" }? + + code-point-attributes &= + attribute bpb { "#" | single-code-point }? + + code-point-attributes &= + attribute dt { "can" | "com" | "enc" | "fin" | "font" | "fra" + | "init" | "iso" | "med" | "nar" | "nb" | "sml" + | "sqr" | "sub" | "sup" | "vert" | "wide" | "none" + }? 
+ + code-point-attributes &= + attribute dm { "#" | zero-or-more-code-points }? + + code-point-attributes &= + attribute CE { boolean }? + + code-point-attributes &= + attribute Comp_Ex { boolean }? + + code-point-attributes &= + attribute NFC_QC { "Y" | "N" | "M" }? + + code-point-attributes &= + attribute NFD_QC { "Y" | "N" }? + + code-point-attributes &= + attribute NFKC_QC { "Y" | "N" | "M" }? + + code-point-attributes &= + attribute NFKD_QC { "Y" | "N" }? + + + code-point-attributes &= + attribute XO_NFC { boolean }? + + code-point-attributes &= + attribute XO_NFD { boolean }? + + code-point-attributes &= + attribute XO_NFKC { boolean }? + + code-point-attributes &= + attribute XO_NFKD { boolean }? + + + code-point-attributes &= + attribute FC_NFKC { "#" | one-or-more-code-points }? + + code-point-attributes &= + attribute nt { "De" | "Di" | "Nu" | "None" }? + + code-point-attributes &= + attribute nv { "NaN" | xsd:string { pattern="-?[0-9]+(/[0-9]+)?" } }? + + code-point-attributes &= + attribute jt { "C" | "D" | "L" | "R" | "T" | "U" }? 
+ + code-point-attributes &= + attribute jg { "African_Feh" | "African_Noon" | "African_Qaf" + | "Ain" | "Alaph" | "Alef" + | "Beh" | "Beth" | "Burushaski_Yeh_Barree" + | "Dal" | "Dalath_Rish" + | "E" + | "Farsi_Yeh" | "Fe" | "Feh" | "Final_Semkath" + | "Gaf" | "Gamal" + | "Hah" | "Hanifi_Rohingya_Kinna_Ya" + | "Hanifi_Rohingya_Pa" | "He" | "Heh" | "Heh_Goal" + | "Heth" + | "Kaf" | "Kaph" | "Kashmiri_Yeh" | "Khaph" + | "Knotted_Heh" + | "Lam" | "Lamadh" + | "Malayalam_Bha" | "Malayalam_Ja" | "Malayalam_Lla" + | "Malayalam_Llla" | "Malayalam_Nga" + | "Malayalam_Nna" | "Malayalam_Nnna" + | "Malayalam_Nya" | "Malayalam_Ra" | "Malayalam_Ssa" + | "Malayalam_Tta" | "Manichaean_Aleph" + | "Manichaean_Ayin" | "Manichaean_Beth" + | "Manichaean_Daleth" | "Manichaean_Dhamedh" + | "Manichaean_Five" | "Manichaean_Gimel" + | "Manichaean_Heth" | "Manichaean_Hundred" + | "Manichaean_Kaph" | "Manichaean_Lamedh" + | "Manichaean_Mem" | "Manichaean_Nun" + | "Manichaean_One" | "Manichaean_Pe" + | "Manichaean_Qoph" | "Manichaean_Resh" + | "Manichaean_Sadhe" | "Manichaean_Samekh" + | "Manichaean_Taw" | "Manichaean_Ten" + | "Manichaean_Teth" | "Manichaean_Thamedh" + | "Manichaean_Twenty" | "Manichaean_Waw" + | "Manichaean_Yodh" | "Manichaean_Zayin" | "Meem" + | "Mim" + | "No_Joining_Group" | "Noon" | "Nun" | "Nya" + | "Pe" + | "Qaf" | "Qaph" + | "Reh" | "Reversed_Pe" | "Rohingya_Yeh" + | "Sad" | "Sadhe" | "Seen" | "Semkath" | "Shin" + | "Straight_Waw" | "Swash_Kaf" | "Syriac_Waw" + | "Tah" | "Taw" | "Teh_Marbuta" | "Teh_Marbuta_Goal" + | "Teth" | "Thin_Yeh" + | "Vertical_Tail" + | "Waw" + | "Yeh" | "Yeh_Barree" | "Yeh_With_Tail" | "Yudh" + | "Yudh_He" + | "Zain" | "Zhain" + }? + + code-point-attributes &= + attribute Join_C { boolean }? 
+ + code-point-attributes &= + attribute lb { "AI" | "AK" | "AL" | "AP" | "AS" + | "B2" | "BA" | "BB" | "BK" + | "CB" | "CJ" | "CL" | "CM" | "CP" | "CR" + | "EB" | "EM" | "EX" + | "GL" + | "H2" | "H3" | "HL" | "HY" + | "ID" | "IN" | "IS" + | "JL" | "JT" | "JV" + | "LF" + | "NL" | "NS" | "NU" + | "OP" + | "PO" | "PR" + | "QU" + | "RI" + | "SA" | "SG" | "SP" | "SY" + | "VF" | "VI" + | "WJ" + | "XX" + | "ZW" | "ZWJ" + }? + + code-point-attributes &= + attribute ea { "A" | "F" | "H" | "N" | "Na" | "W" }? + + code-point-attributes &= + attribute Upper { boolean }? + + code-point-attributes &= + attribute Lower { boolean }? + + code-point-attributes &= + attribute OUpper { boolean }? + + code-point-attributes &= + attribute OLower { boolean }? + + code-point-attributes &= + attribute suc { "#" | single-code-point }? + + code-point-attributes &= + attribute slc { "#" | single-code-point }? + + code-point-attributes &= + attribute stc { "#" | single-code-point }? + + code-point-attributes &= + attribute uc { "#" | one-or-more-code-points }? + + code-point-attributes &= + attribute lc { "#" | one-or-more-code-points }? + + code-point-attributes &= + attribute tc { "#" | one-or-more-code-points }? + + code-point-attributes &= + attribute scf { "#" | single-code-point }? + + code-point-attributes &= + attribute cf { "#" | one-or-more-code-points }? + + code-point-attributes &= + attribute CI { boolean }? + + code-point-attributes &= + attribute Cased { boolean }? + + code-point-attributes &= + attribute CWCF { boolean }? + + code-point-attributes &= + attribute CWCM { boolean }? + + code-point-attributes &= + attribute CWL { boolean }? + + code-point-attributes &= + attribute CWKCF { boolean }? + + code-point-attributes &= + attribute CWT { boolean }? + + code-point-attributes &= + attribute CWU { boolean }? + + code-point-attributes &= + attribute NFKC_CF { "#" | zero-or-more-code-points }? + + code-point-attributes &= + attribute NFKC_SCF { "#" | zero-or-more-code-points }? 
+ + script = "Adlm" | "Aghb" | "Ahom" | "Arab" | "Armi" | "Armn" + | "Avst" + | "Bali" | "Bamu" | "Bass" | "Batk" | "Beng" | "Bhks" + | "Bopo" | "Brah" | "Brai" | "Bugi" | "Buhd" + | "Cakm" | "Cans" | "Cari" | "Cham" | "Cher" | "Chrs" + | "Copt" | "Cpmn" | "Cprt" | "Cyrl" + | "Deva" | "Diak" | "Dogr" | "Dsrt" | "Dupl" + | "Egyp" | "Elba" | "Elym" | "Ethi" + | "Gara" | "Geor" | "Glag" | "Gong" | "Gonm" | "Goth" + | "Gran" | "Grek" | "Gujr" | "Gukh" | "Guru" + | "Hang" | "Hani" | "Hano" | "Hatr" | "Hebr" | "Hira" + | "Hluw" | "Hmng" | "Hmnp" | "Hrkt" | "Hung" + | "Ital" + | "Java" + | "Kali" | "Kana" | "Kawi" | "Khar" | "Khmr" | "Khoj" + | "Kits" | "Knda" | "Krai" | "Kthi" + | "Lana" | "Laoo" | "Latn" | "Lepc" | "Limb" | "Lina" + | "Linb" | "Lisu" | "Lyci" | "Lydi" + | "Mahj" | "Maka" | "Mand" | "Mani" | "Marc" | "Medf" + | "Mend" | "Merc" | "Mero" | "Mlym" | "Modi" | "Mong" + | "Mroo" | "Mtei" | "Mult" | "Mymr" + | "Nagm" | "Nand" | "Narb" | "Nbat" | "Newa" | "Nkoo" + | "Nshu" + | "Ogam" | "Olck" | "Onao" | "Orkh" | "Orya" | "Osge" + | "Osma" | "Ougr" + | "Palm" | "Pauc" | "Perm" | "Phag" | "Phli" | "Phlp" + | "Phnx" | "Plrd" | "Prti" + | "Rjng" | "Rohg" | "Runr" + | "Samr" | "Sarb" | "Saur" | "Sgnw" | "Shaw" | "Shrd" + | "Sidd" | "Sind" | "Sinh" | "Sogd" | "Sogo" | "Sora" + | "Soyo" | "Sund" | "Sunu" | "Sylo" | "Syrc" + | "Tagb" | "Takr" | "Tale" | "Talu" | "Taml" | "Tang" + | "Tavt" | "Telu" | "Tfng" | "Tglg" | "Thaa" | "Thai" + | "Tibt" | "Tirh" | "Tnsa" | "Todr" | "Toto" | "Tutg" + | "Ugar" + | "Vaii" | "Vith" + | "Wara" | "Wcho" + | "Xpeo" | "Xsux" + | "Yezi" | "Yiii" + | "Zanb" | "Zinh" | "Zyyy" | "Zzzz" + + code-point-attributes &= + attribute sc { script }? + + code-point-attributes &= + attribute scx { list { script + } }? + + code-point-attributes &= + attribute isc { text }? + + code-point-attributes &= + attribute hst { "L" | "LV" | "LVT" | "NA" | "T" | "V" }? + + code-point-attributes &= + attribute JSN { xsd:string { pattern="[A-Z]{0,3}" } }? 
+ + code-point-attributes &= + attribute InSC { "Avagraha" + | "Bindu" + | "Brahmi_Joining_Number" + | "Cantillation_Mark" + | "Consonant" + | "Consonant_Dead" + | "Consonant_Final" + | "Consonant_Head_Letter" + | "Consonant_Initial_Postfixed" + | "Consonant_Killer" + | "Consonant_Medial" + | "Consonant_Placeholder" + | "Consonant_Preceding_Repha" + | "Consonant_Prefixed" + | "Consonant_Subjoined" + | "Consonant_Succeeding_Repha" + | "Consonant_With_Stacker" + | "Gemination_Mark" + | "Invisible_Stacker" + | "Joiner" + | "Modifying_Letter" + | "Non_Joiner" + | "Nukta" + | "Number" + | "Number_Joiner" + | "Other" + | "Pure_Killer" + | "Register_Shifter" + | "Reordering_Killer" + | "Syllable_Modifier" + | "Tone_Letter" + | "Tone_Mark" + | "Virama" + | "Visarga" + | "Vowel" + | "Vowel_Dependent" + | "Vowel_Independent" + }? + + code-point-attributes &= + attribute InPC { "Bottom" + | "Bottom_And_Left" + | "Bottom_And_Right" + | "Left" + | "Left_And_Right" + | "NA" + | "Overstruck" + | "Right" + | "Top" + | "Top_And_Bottom" + | "Top_And_Bottom_And_Left" + | "Top_And_Bottom_And_Right" + | "Top_And_Left" + | "Top_And_Left_And_Right" + | "Top_And_Right" + | "Visual_Order_Left" + }? + + code-point-attributes &= + attribute InCB { "Consonant" + | "Extend" + | "Linker" + | "None" + }? + + code-point-attributes &= + attribute IDS { boolean }? + + code-point-attributes &= + attribute OIDS { boolean }? + + code-point-attributes &= + attribute XIDS { boolean }? + + code-point-attributes &= + attribute IDC { boolean }? + + code-point-attributes &= + attribute OIDC { boolean }? + + code-point-attributes &= + attribute XIDC { boolean }? + + code-point-attributes &= + attribute ID_Compat_Math_Start { boolean }? + + code-point-attributes &= + attribute ID_Compat_Math_Continue { boolean }? + + code-point-attributes &= + attribute Pat_Syn { boolean }? + + code-point-attributes &= + attribute Pat_WS { boolean }? + + code-point-attributes &= + attribute Dash { boolean }? 
+ + code-point-attributes &= + attribute Hyphen { boolean }? + + code-point-attributes &= + attribute QMark { boolean }? + + code-point-attributes &= + attribute Term { boolean }? + + code-point-attributes &= + attribute STerm { boolean }? + + code-point-attributes &= + attribute Dia { boolean }? + + code-point-attributes &= + attribute Ext { boolean }? + + code-point-attributes &= + attribute SD { boolean }? + + code-point-attributes &= + attribute Alpha { boolean }? + + code-point-attributes &= + attribute OAlpha { boolean }? + + code-point-attributes &= + attribute Math { boolean }? + + code-point-attributes &= + attribute OMath { boolean }? + + code-point-attributes &= + attribute Hex { boolean }? + + code-point-attributes &= + attribute AHex { boolean }? + + code-point-attributes &= + attribute DI { boolean }? + + code-point-attributes &= + attribute ODI { boolean }? + + code-point-attributes &= + attribute LOE { boolean }? + + code-point-attributes &= + attribute PCM { boolean }? + + code-point-attributes &= + attribute MCM { boolean }? + + code-point-attributes &= + attribute WSpace { boolean }? + + code-point-attributes &= + attribute vo { "R" | "Tr" | "Tu" | "U" }? + + code-point-attributes &= + attribute RI { boolean }? + + code-point-attributes &= + attribute Gr_Base { boolean }? + + code-point-attributes &= + attribute Gr_Ext { boolean }? + + code-point-attributes &= + attribute OGr_Ext { boolean }? + + code-point-attributes &= + attribute Gr_Link { boolean }? + + code-point-attributes &= + attribute GCB { "CN" | "CR" + | "EB" | "EBG" | "EM" | "EX" + | "GAZ" + | "L" | "LF" | "LV" | "LVT" + | "PP" + | "RI" + | "SM" + | "T" + | "V" + | "XX" + | "ZWJ" + }? + + code-point-attributes &= + attribute WB { "CR" + | "DQ" + | "EB" | "EBG" | "EM" | "EX" | "Extend" + | "FO" + | "GAZ" + | "HL" + | "KA" + | "LE" | "LF" + | "MB" | "ML" | "MN" + | "NL" | "NU" + | "RI" + | "SQ" + | "WSegSpace" + | "XX" + | "ZWJ" + }? 
+ + code-point-attributes &= + attribute SB { "AT" + | "CL" | "CR" + | "EX" + | "FO" + | "LE" | "LF" | "LO" + | "NU" + | "SC" | "SE" | "SP" | "ST" + | "UP" + | "XX" + }? + + code-point-attributes &= + attribute Ideo { boolean }? + + code-point-attributes &= + attribute UIdeo { boolean }? + + code-point-attributes &= + attribute EqUIdeo { single-code-point }? + + code-point-attributes &= + attribute IDSB { boolean }? + + code-point-attributes &= + attribute IDST { boolean }? + + code-point-attributes &= + attribute IDSU { boolean }? + + code-point-attributes &= + attribute Radical { boolean }? + + code-point-attributes &= + attribute Dep { boolean }? + + code-point-attributes &= + attribute VS { boolean }? + + code-point-attributes &= + attribute NChar { boolean }? + + code-point-attributes &= attribute kAccountingNumeric + { xsd:string { pattern="[0-9]+" } }? + + code-point-attributes &= attribute kAlternateTotalStrokes + { list { xsd:string { pattern="(\d+:[BHJKMPSUV]+)|-" }+ } }? + + code-point-attributes &= attribute kBigFive + { xsd:string { pattern="[0-9A-F]{4}'?" } }? + + code-point-attributes &= attribute kCangjie + { xsd:string { pattern="[A-Z]+" } }? + + code-point-attributes &= attribute kCantonese + { list { xsd:string { pattern="[a-z]{1,6}[1-6]" }+ } }? + + code-point-attributes &= attribute kCCCII + { list { xsd:string { pattern="[0-9A-F]{6}" }+ } }? + + code-point-attributes &= attribute kCheungBauer + { list { xsd:string { pattern="[0-9]{3}/[0-9]{2};[A-Z]*;[a-z1-6\[\]/,]+" }+ } }? + + code-point-attributes &= attribute kCheungBauerIndex + { list { xsd:string { pattern="[0-9]{3}\.[01][0-9]" }+ } }? + + code-point-attributes &= attribute kCihaiT + { list { xsd:string { pattern="[1-9][0-9]{0,3}\.[0-9]{3}" }+ } }? + + code-point-attributes &= attribute kCNS1986 + { xsd:string { pattern="[12E]-[0-9A-F]{4}" } }? + + code-point-attributes &= attribute kCNS1992 + { xsd:string { pattern="[1-9]-[0-9A-F]{4}" } }? 
+ + code-point-attributes &= attribute kCompatibilityVariant + { "" | xsd:string { pattern="U\+[23]?[0-9A-F]{4}" } }? + + code-point-attributes &= attribute kCowles + { list { xsd:string { pattern="[0-9]{1,4}(\.[0-9]{1,2})?" }+ } }? + + code-point-attributes &= attribute kDaeJaweon + { xsd:string { pattern="[0-9]{4}\.[0-9]{2}[01]" } }? + + code-point-attributes &= attribute kDefinition + { xsd:string { pattern='[^\t"]+' } }? + + code-point-attributes &= attribute kEACC + { xsd:string { pattern="[0-9A-F]{6}" } }? + + code-point-attributes &= attribute kFanqie + { list { xsd:string { pattern="[\x{3400}-\x{4DBF}\x{4E00}-\x{9FFF}\x{20000}-\x{2A6DF}]{2}" }+ } }? + + code-point-attributes &= attribute kFenn + { list { xsd:string { pattern="[0-9]+a?[A-KP*]" }+ } }? + + code-point-attributes &= attribute kFennIndex + { list { xsd:string { pattern="[0-9][0-9]{0,2}\.[01][0-9]" }+ } }? + + code-point-attributes &= attribute kFourCornerCode + { list { xsd:string { pattern="[0-9]{4}(\.[0-9])?" }+ } }? + + code-point-attributes &= attribute kGB0 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB1 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB3 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB5 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB7 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB8 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGradeLevel + { xsd:string { pattern="[1-6]" } }? + + code-point-attributes &= attribute kGSR + { list { xsd:string { pattern="[0-9]{4}[a-vx-z]'?" }+ } }? + + code-point-attributes &= attribute kHangul + { list { xsd:string { pattern="[\x{1100}-\x{1112}][\x{1161}-\x{1175}][\x{11A8}-\x{11C2}]?:[01ENX]{1,3}" }+ } }? + + code-point-attributes &= attribute kHanYu + { list { xsd:string { pattern="[1-8][0-9]{4}\.[0-3][0-9][0-3]" }+ } }? 
+ + code-point-attributes &= attribute kHanyuPinlu + { list { xsd:string { pattern="[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+\([0-9]+\)" }+ } }? + + code-point-attributes &= attribute kHanyuPinyin + { list { xsd:string { pattern="(\d{5}\.\d{2}0,)*\d{5}\.\d{2}0:([a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+,)*[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kHDZRadBreak + { xsd:string { pattern="[\x{2F00}-\x{2FD5}]\[U\+2F[0-9A-D][0-9A-F]\]:[1-8][0-9]{4}\.[0-3][0-9]0" } }? + + code-point-attributes &= attribute kHKGlyph + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kIBMJapan + { list { xsd:string { pattern="F[ABC][0-9A-F]{2}" }+ } }? + + code-point-attributes &= attribute kIICore + { list { xsd:string { pattern="[ABC][GHJKMPT]{1,7}" }+ } }? + + code-point-attributes &= attribute kIRG_GSource + { "" | xsd:string { pattern="G[013578EKS]-[0-9A-F]{4}" } + | xsd:string { pattern="G4K(-\d{5})?" } + | xsd:string { pattern="G(DZ|GH|RM|WZ|XC|XH|ZH)-\d{4}\.\d{2}" } + | xsd:string { pattern="G(BK|CH|CY|HC)(-\d{4}\.\d{2})?" } + | xsd:string { pattern="GKX-\d{4}\.\d{2,3}" } + | xsd:string { pattern="G(HZ|HZR)-\d{5}\.\d{2}" } + | xsd:string { pattern="G(CE|FC|IDC23|OCD|XHZ)-\d{3}" } + | xsd:string { pattern="G(H|HF|LGYJ|PGLG|T)-\d{4}" } + | xsd:string { pattern="G(CYY|DM|JZ|KJ|XM|ZFY|ZJW|ZYS)-\d{5}" } + | xsd:string { pattern="G(FZ|IDC)-[0-9A-F]{4}" } + | xsd:string { pattern="GGFZ-\d{6}" } + | xsd:string { pattern="G(LK|Z)-\d{7}" } + | xsd:string { pattern="GU-[023][0-9A-F]{4}" } + | xsd:string { pattern="GZA-[123467]\d{5}" } + }? + + code-point-attributes &= attribute kIRG_HSource + { "" | xsd:string { pattern="H-[0-9A-F]{4}" } + | xsd:string { pattern="H(B[012])-[0-9A-F]{4}" } + | xsd:string { pattern="HD-[23]?[0-9A-F]{4}" } + | xsd:string { pattern="HU-[023][0-9A-F]{4}" } + }? 
+ + code-point-attributes &= attribute kIRG_JSource + { "" | xsd:string { pattern="J[014]-[0-9A-F]{4}" } + | xsd:string { pattern="J3A?-[0-9A-F]{4}" } + | xsd:string { pattern="J13A?-[0-9A-F]{4}" } + | xsd:string { pattern="J14-[0-9A-F]{4}" } + | xsd:string { pattern="JA[34]?-[0-9A-F]{4}" } + | xsd:string { pattern="JARIB-[0-9A-F]{4}" } + | xsd:string { pattern="JH-(JT[ABC][0-9A-F]{3}S?|IB\d{4}|\d{6})" } + | xsd:string { pattern="JK-\d{5}" } + | xsd:string { pattern="JMJ-\d{6}" } + }? + + code-point-attributes &= attribute kIRG_KPSource + { "" | xsd:string { pattern="KP([01]-[0-9A-F]{4}|U-[023][0-9A-F]{4})" } }? + + code-point-attributes &= attribute kIRG_KSource + { "" | xsd:string { pattern="K[0-6]-[0-9A-F]{4}" } + | xsd:string { pattern="KC-\d{5}" } + | xsd:string { pattern="KU-[023][0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRG_MSource + { "" | xsd:string { pattern="MA-[0-9A-F]{4}" } + | xsd:string { pattern="MB[12]-[0-9A-F]{4}" } + | xsd:string { pattern="MC-\d{5}" } + | xsd:string { pattern="MDH?-[23]?[0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRG_SSource + { "" | xsd:string { pattern="SAT-\d{5}" } }? + + code-point-attributes &= attribute kIRG_TSource + { "" | xsd:string { pattern="T([1-7A-F]|1[1-3])-[0-9A-F]{4}" } + | xsd:string { pattern="TU-[023][0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRG_UKSource + { "" | xsd:string { pattern="UK-\d{5}" } }? + + code-point-attributes &= attribute kIRG_USource + { "" | xsd:string { pattern="UTC-\d{5}" } }? + + code-point-attributes &= attribute kIRG_VSource + { "" | xsd:string { pattern="V[0-4]-[0-9A-F]{4}" } + | xsd:string { pattern="VN-[023F][0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRGDaeJaweon + { list { xsd:string { pattern="[0-9]{4}\.[0-9]{2}[01]" }+ } }? + + code-point-attributes &= attribute kIRGHanyuDaZidian + { list { xsd:string { pattern="[1-8][0-9]{4}\.[0-3][0-9][01]" }+ } }? 
+ + code-point-attributes &= attribute kIRGKangXi + { list { xsd:string { pattern="[01][0-9]{3}\.[0-7][0-9][01]" }+ } }? + + code-point-attributes &= attribute kJa + { list { xsd:string { pattern="[0-9A-F]{4}S?" }+ } }? + + code-point-attributes &= attribute kJapanese + { list { xsd:string { pattern="[\x{3041}-\x{3096}\x{3099}\x{309A}\x{30A1}-\x{30FA}\x{30FC}]+" }+ } }? + + code-point-attributes &= attribute kJapaneseKun + { list { xsd:string { pattern="[A-Z]+" }+ } }? + + code-point-attributes &= attribute kJapaneseOn + { list { xsd:string { pattern="[A-Z]+" }+ } }? + + code-point-attributes &= attribute kJinmeiyoKanji + { list { xsd:string { pattern="(20[0-9]{2})(:U\+[23]?[0-9A-F]{4})?" }+ } }? + + code-point-attributes &= attribute kJis0 + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kJis1 + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kJIS0213 + { list { xsd:string { pattern="[12],[0-9]{2},[0-9]{1,2}" }+ } }? + + code-point-attributes &= attribute kJoyoKanji + { list { xsd:string { pattern="(20[0-9]{2})|(U\+[23]?[0-9A-F]{4})" }+ } }? + + code-point-attributes &= attribute kKangXi + { list { xsd:string { pattern="[0-9]{4}\.[0-9]{2}[01]" }+ } }? + + code-point-attributes &= attribute kKarlgren + { list { xsd:string { pattern="[1-9][0-9]{0,3}[A*]?" }+ } }? + + code-point-attributes &= attribute kKorean + { list { xsd:string { pattern="[A-Z]+" }+ } }? + + code-point-attributes &= attribute kKoreanEducationHanja + { list { xsd:string { pattern="20[0-9]{2}" }+ } }? + + code-point-attributes &= attribute kKoreanName + { list { xsd:string { pattern="20[0-9]{2}" }+ } }? + + code-point-attributes &= attribute kLau + { list { xsd:string { pattern="[1-9][0-9]{0,3}" }+ } }? + + code-point-attributes &= attribute kMainlandTelegraph + { list { xsd:string { pattern="[0-9]{4}" }+ } }? 
+ + code-point-attributes &= attribute kMandarin + { list { xsd:string { pattern="[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kMatthews + { list { xsd:string { pattern="[1-9][0-9]{0,3}(a|\.5)?" }+ } }? + + code-point-attributes &= attribute kMeyerWempe + { list { xsd:string { pattern="[1-9][0-9]{0,3}[a-t*]?" }+ } }? + + code-point-attributes &= attribute kMojiJoho + { list { xsd:string { pattern="MJ\d{6}(:(FE0[01]|E01[01][0-9A-F]))?" }+ } }? + + code-point-attributes &= attribute kMorohashi + { list { xsd:string { pattern="(\d{5}'{0,2}|H\d{3})(:(FE0[01]|E010[0-9A-F]))?" }+ } }? + + code-point-attributes &= attribute kNelson + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kOtherNumeric + { list { xsd:string { pattern="[0-9]+" }+ } }? + + code-point-attributes &= attribute kPhonetic + { list { xsd:string { pattern="[1-9][0-9]{0,3}[A-D]?\*?" }+ } }? + + code-point-attributes &= attribute kPrimaryNumeric + { list { xsd:string { pattern="[0-9]+" }+ } }? + + code-point-attributes &= attribute kPseudoGB1 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kRSAdobe_Japan1_6 + { list { xsd:string { pattern="[CV]\+[0-9]{1,5}\+[1-9][0-9]{0,2}\.[1-9][0-9]?\.[0-9]{1,2}" }+ } }? + + code-point-attributes &= attribute kRSUnicode + { list { xsd:string { pattern="[1-9][0-9]{0,2}'{0,3}\.-?[0-9]{1,2}" }+ } }? + + code-point-attributes &= attribute kSBGY + { list { xsd:string { pattern="[0-9]{3}\.[0-7][0-9]" }+ } }? + + code-point-attributes &= attribute kSemanticVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}(<[ks][A-Za-z0-9_]+(:[TBZFJ]+)?(,[ks][A-Za-z0-9_]+(:[TBZFJ]+)?)*)?" }+ } }? + + code-point-attributes &= attribute kSimplifiedVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}" }+ } }? + + code-point-attributes &= attribute kSMSZD2003Index + { list { xsd:string { pattern="\d{1,3}\.\d{2}" }+ } }? 
+ + code-point-attributes &= attribute kSMSZD2003Readings + { list { xsd:string { pattern="[a-z\x{300}\x{301}\x{302}\x{304}\x{308}\x{30C}]+(,[a-z\x{300}\x{301}\x{302}\x{304}\x{308}\x{30C}]+)*\x{7CB5}[a-z]+[1-6]([a-z]+[1-6])?(,[a-z]+[1-6]([a-z]+[1-6])?)*" }+ } }? + + code-point-attributes &= attribute kSpecializedSemanticVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}(<[ks][A-Za-z0-9_]+(:[TBZFJ]+)?(,[ks][A-Za-z0-9_]+(:[TBZFJ]+)?)*)?" }+ } }? + + code-point-attributes &= attribute kSpoofingVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}" }+ } }? + + code-point-attributes &= attribute kStrange + { list { ( xsd:string { pattern="[ACU]" } + | xsd:string { pattern="B:U\+31[0-2AB][0-9A-F]" } + | xsd:string { pattern="[FMOR](:U\+[23]?[0-9A-F]{4})?" } + | xsd:string { pattern="H:U\+31[3-8][0-9A-F]" } + | xsd:string { pattern="I(:U\+[23]?[0-9A-F]{4})*" } + | xsd:string { pattern="K(:U\+30[A-F][0-9A-F])+" } + | xsd:string { pattern="S:[4-9][0-9]" } + )+}}? + + code-point-attributes &= attribute kTaiwanTelegraph + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kTang + { list { xsd:string { pattern="\*?[A-Za-z()\x{E6}\x{251}\x{259}\x{25B}\x{300}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kTGH + { list { xsd:string { pattern="20[0-9]{2}:[1-9][0-9]{0,3}" }+ } }? + + code-point-attributes &= attribute kTGHZ2013 + { list { xsd:string { pattern="[0-9]{3}\.[0-9]{3}(,[0-9]{3}\.[0-9]{3})*:[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kTotalStrokes + { list { xsd:string { pattern="[1-9][0-9]{0,2}" }+ } }? + + code-point-attributes &= attribute kTraditionalVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}" }+ } }? + + code-point-attributes &= attribute kUnihanCore2020 + { xsd:string { pattern="[GHJKMPT]{1,7}" } }? 
+ + code-point-attributes &= attribute kVietnamese + { list { xsd:string { pattern="[A-Za-z\x{110}\x{111}\x{300}-\x{303}\x{306}\x{309}\x{31B}\x{323}]+" }+ } }? + + code-point-attributes &= attribute kVietnameseNumeric + { list { xsd:string { pattern="\d+" }+ } }? + + code-point-attributes &= attribute kXerox + { list { xsd:string { pattern="[0-9]{3}:[0-9]{3}" }+ } }? + + code-point-attributes &= attribute kXHC1983 + { list { xsd:string { pattern="[0-9]{4}\.[0-9]{3}\*?(,[0-9]{4}\.[0-9]{3}\*?)*:[a-z\x{300}\x{301}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kZhuang + { list { xsd:string { pattern="[a-z]+\*?" }+ } }? + + code-point-attributes &= attribute kZhuangNumeric + { list { xsd:string { pattern="\d+" }+ } }? + + code-point-attributes &= attribute kZVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}(<[ks][A-Za-z0-9_]+(:[TBZ]+)?(,[ks][A-Za-z0-9_]+(:[TBZ]+)?)*)?" }+ } }? + + + code-point-attributes &= + attribute kRSTUnicode { xsd:string { pattern="[0-9]+\.[0-9]+" } }? + + code-point-attributes &= + attribute kTGT_MergedSrc + { xsd:string {pattern="L2008-[0-9A-F]{4,5}(-[0-9]{4,5})?"} + | xsd:string {pattern="L2006-[0-9]{4}"} + | xsd:string {pattern="L1997-[0-9]{4}"} + | xsd:string {pattern="L1986-[0-9]{4}"} + | xsd:string {pattern="S1968-[0-9]{4}"} + | xsd:string {pattern="N1966-[0-9]{3}(-[0-9A-Z]{3,4})?"} + | xsd:string {pattern="H2004-[A-Z]-[0-9]{4}"} + | xsd:string {pattern="L2012-[0-9]{4}"} + | xsd:string {pattern="UTN42-[0-9]{3}"} + }? + + + code-point-attributes &= + attribute kSrc_NushuDuben { xsd:string { pattern="[0-9]+\.[0-9]+" } }? + + code-point-attributes &= + attribute kReading { xsd:string }? + + + ucd.content &= + element blocks { + element block { + attribute first-cp { single-code-point }, + attribute last-cp { single-code-point }, + attribute name { text } }+ }? 
+ + + ucd.content &= + element named-sequences { + element named-sequence { + attribute cps { one-or-more-code-points }, + attribute name { text } }+ }? + + ucd.content &= + element provisional-named-sequences { + element named-sequence { + attribute cps { one-or-more-code-points }, + attribute name { text } }+ }? + + + ucd.content &= + element normalization-corrections { + element normalization-correction { + attribute cp { single-code-point }, + attribute old { one-or-more-code-points }, + attribute new { one-or-more-code-points }, + attribute version { text } }+ }? + + + ucd.content &= + element standardized-variants { + element standardized-variant { + attribute cps { two-code-points }, + attribute desc { text }, + attribute when { text } }+ }? + + + ucd.content &= + element cjk-radicals { + element cjk-radical { + attribute number { xsd:string {pattern="[0-9]{1,3}'{0,3}"}}, + attribute radical { single-code-point? }, + attribute ideograph { single-code-point } }+ }? + + + ucd.content &= + element emoji-sources { + element emoji-source { + attribute unicode { one-or-more-code-points }, + attribute docomo { jis-code-point? }, + attribute kddi { jis-code-point? }, + attribute softbank { jis-code-point? } }+ }? + + + code-point-attributes &= + attribute Emoji { boolean }? + + code-point-attributes &= + attribute EPres { boolean }? + + code-point-attributes &= + attribute EMod { boolean }? + + code-point-attributes &= + attribute EBase { boolean }? + + code-point-attributes &= + attribute EComp { boolean }? + + code-point-attributes &= + attribute ExtPict { boolean }? 
+ + + ucd.content &= + element do-not-emit { + element instead { + attribute of { one-or-more-code-points }, + attribute use { one-or-more-code-points }, + attribute because { "Bengali_Khanda_Ta" + | "Deprecated" + | "Discouraged" + | "Dotless_Form" + | "Hamza_Form" + | "Indic_Atomic_Consonant" + | "Indic_Consonant_Conjunct" + | "Indic_Vowel_Letter" + | "Malayalam_Chillu" + | "Precomposed_Form" + | "Precomposed_Hieroglyph" + | "Preferred_Spelling" + | "Tamil_Shrii" + } }+ }? + diff --git a/unicodetools/src/main/resources/org/unicode/uax42/pom.xml b/unicodetools/src/main/resources/org/unicode/uax42/pom.xml new file mode 100644 index 0000000000..9ae81d56f9 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/pom.xml @@ -0,0 +1,72 @@ + + + + 4.0.0 + + uax42 + Unicode Standard Annex #42 + + + + org.unicode.unicodetools + unicodetools-parent + 1.0.0 + + + + + + org.codehaus.mojo + xml-maven-plugin + 1.1.0 + + + + transform + + + + + + + ${project.basedir} + true + + index.xml + + index2html.xsl + ${outputdir} + + + .html + + + + + ${project.basedir} + true + + index.xml + + index2rnc.xsl + ${outputdir} + + + .rnc + + + + + + + + net.sf.saxon + Saxon-HE + 12.4 + + + + + + + From 66527bbdd0f562648fdfd831d733ea8ce6911eaf Mon Sep 17 00:00:00 2001 From: John Wilcock Date: Fri, 7 Feb 2025 14:44:16 -0800 Subject: [PATCH 04/10] Review changes from Markus --- docs/ucdxml.md | 4 +- .../org/unicode/xml/AttributeResolver.java | 51 +- ...{CompareUcdXML.java => CompareUCDXML.java} | 11 +- .../unicode/xml/GeneratePropertyValues.java | 20 +- .../java/org/unicode/xml/UCDDataResolver.java | 17 +- ...ertyDetail.java => UCDPropertyDetail.java} | 1068 +++++++++-------- ...omponent.java => UCDSectionComponent.java} | 7 +- ...ctionDetail.java => UCDSectionDetail.java} | 87 +- .../unicode/xml/{UcdXML.java => UCDXML.java} | 208 ++-- .../java/org/unicode/xml/UCDXMLWriter.java | 9 +- .../java/org/unicode/xml/XMLProperties.java | 16 +- .../resources/org/unicode/uax42/index.xml | 4 
+- 12 files changed, 795 insertions(+), 707 deletions(-) rename unicodetools/src/main/java/org/unicode/xml/{CompareUcdXML.java => CompareUCDXML.java} (96%) rename unicodetools/src/main/java/org/unicode/xml/{UcdPropertyDetail.java => UCDPropertyDetail.java} (70%) rename unicodetools/src/main/java/org/unicode/xml/{UcdSectionComponent.java => UCDSectionComponent.java} (75%) rename unicodetools/src/main/java/org/unicode/xml/{UcdSectionDetail.java => UCDSectionDetail.java} (72%) rename unicodetools/src/main/java/org/unicode/xml/{UcdXML.java => UCDXML.java} (85%) diff --git a/docs/ucdxml.md b/docs/ucdxml.md index 207842db2a..a8d1d1e954 100644 --- a/docs/ucdxml.md +++ b/docs/ucdxml.md @@ -10,8 +10,8 @@ ## Step 3 - Validate generated UAX XML files -You'll need a [RELAX NG](https://relaxng.org/) schema validator. We'll use [jing-trang](https://github. -com/relaxng/jing-trang) in this example. +You'll need a [RELAX NG](https://relaxng.org/) schema validator. +We'll use [jing-trang](https://github.com/relaxng/jing-trang) in this example. 1. Clone and build [jing-trang](https://github.com/relaxng/jing-trang) 2. 
Run the following: diff --git a/unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java b/unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java index ccb4984ec7..38d05786d5 100644 --- a/unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java +++ b/unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java @@ -2,9 +2,24 @@ import com.ibm.icu.impl.UnicodeMap; import com.ibm.icu.util.VersionInfo; -import java.util.*; import org.unicode.cldr.draft.FileUtilities; -import org.unicode.props.*; +import org.unicode.props.IndexUnicodeProperties; +import org.unicode.props.PropertyParsingInfo; +import org.unicode.props.UcdLineParser; +import org.unicode.props.UcdProperty; +import org.unicode.props.UcdPropertyValues; +import org.unicode.props.UnicodeProperty; + +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Locale; +import java.util.Optional; + +/** + * Used by UCDXML to get string values of attributes for each code point from IndexUnicodeProperties. + */ public class AttributeResolver { @@ -19,14 +34,14 @@ public class AttributeResolver { // If there is a change in any of these properties between two adjacent characters, it will // result in a new range. 
- private final UcdPropertyDetail[] rangeDefiningPropertyDetails = { - UcdPropertyDetail.Age_Detail, - UcdPropertyDetail.Bidi_Class_Detail, - UcdPropertyDetail.Block_Detail, - UcdPropertyDetail.Decomposition_Mapping_Detail, - UcdPropertyDetail.Numeric_Type_Detail, - UcdPropertyDetail.Numeric_Value_Detail, - UcdPropertyDetail.Vertical_Orientation_Detail + private final UCDPropertyDetail[] rangeDefiningPropertyDetails = { + UCDPropertyDetail.Age_Detail, + UCDPropertyDetail.Bidi_Class_Detail, + UCDPropertyDetail.Block_Detail, + UCDPropertyDetail.Decomposition_Mapping_Detail, + UCDPropertyDetail.Numeric_Type_Detail, + UCDPropertyDetail.Numeric_Value_Detail, + UCDPropertyDetail.Vertical_Orientation_Detail }; public AttributeResolver(IndexUnicodeProperties iup) { @@ -93,7 +108,7 @@ public int compare(NameAlias o1, NameAlias o2) { } private HashMap> loadNameAliases() { - HashMap> nameAliasesByCodepoint = new HashMap<>(); + HashMap> nameAliasesByCodePoint = new HashMap<>(); final PropertyParsingInfo fileInfo = PropertyParsingInfo.getPropertyInfo(UcdProperty.Name_Alias); String fullFilename = fileInfo.getFullFileName(indexUnicodeProperties.getUcdVersion()); @@ -112,17 +127,17 @@ private HashMap> loadNameAliases() { parts[1], AliasType.valueOf(parts[2].toUpperCase(Locale.ROOT))); } - if (nameAliasesByCodepoint.containsKey(codepoint)) { + if (nameAliasesByCodePoint.containsKey(codepoint)) { LinkedList nameAliases = - new LinkedList<>(nameAliasesByCodepoint.get(codepoint)); + new LinkedList<>(nameAliasesByCodePoint.get(codepoint)); nameAliases.add(nameAlias); nameAliases.sort(nameAliasComparator); - nameAliasesByCodepoint.replace(codepoint, nameAliases); + nameAliasesByCodePoint.replace(codepoint, nameAliases); } else { - nameAliasesByCodepoint.put(codepoint, new LinkedList<>(List.of(nameAlias))); + nameAliasesByCodePoint.put(codepoint, new LinkedList<>(List.of(nameAlias))); } } - return nameAliasesByCodepoint; + return nameAliasesByCodePoint; } public String 
getAttributeValue(UcdProperty prop, int codepoint) { @@ -254,7 +269,7 @@ public String getAttributeValue(UcdProperty prop, int codepoint) { } } - public boolean isUnassignedCodepoint(int codepoint) { + public boolean isUnassignedCodePoint(int codepoint) { return UcdPropertyValues.General_Category_Values.Unassigned.equals(getgc(codepoint)) || UcdPropertyValues.General_Category_Values.Private_Use.equals(getgc(codepoint)) || UcdPropertyValues.General_Category_Values.Surrogate.equals(getgc(codepoint)); @@ -300,7 +315,7 @@ private String getMappingValue( public boolean isDifferentRange(VersionInfo ucdVersion, int codepointA, int codepointB) { boolean isDifference = false; - for (UcdPropertyDetail propDetail : rangeDefiningPropertyDetails) { + for (UCDPropertyDetail propDetail : rangeDefiningPropertyDetails) { UcdProperty prop = propDetail.getUcdProperty(); if (ucdVersion.compareTo(propDetail.getMinVersion()) >= 0 && (propDetail.getMaxVersion() == null diff --git a/unicodetools/src/main/java/org/unicode/xml/CompareUcdXML.java b/unicodetools/src/main/java/org/unicode/xml/CompareUCDXML.java similarity index 96% rename from unicodetools/src/main/java/org/unicode/xml/CompareUcdXML.java rename to unicodetools/src/main/java/org/unicode/xml/CompareUCDXML.java index 52120eda45..d2876c1229 100644 --- a/unicodetools/src/main/java/org/unicode/xml/CompareUcdXML.java +++ b/unicodetools/src/main/java/org/unicode/xml/CompareUCDXML.java @@ -3,12 +3,19 @@ import com.ibm.icu.dev.tool.UOption; import com.ibm.icu.impl.UnicodeMap; import com.ibm.icu.text.UnicodeSet; -import java.io.*; + +import java.io.File; +import java.io.IOException; import java.util.HashMap; import java.util.Objects; import org.unicode.props.UcdProperty; -public class CompareUcdXML { +/** + * Utility for comparing two UCDXML files. + * Originally intended to compare UCDXML files generated using https://github.com/eric-muller/ucdxml to UCDXML files + * generated using org.unicode.xml.UCDXML. 
+ */ +public class CompareUCDXML { private static final String NEWLINE = System.getProperty("line.separator"); private static final UOption[] options = { diff --git a/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java b/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java index f8a0dfa279..03f10a428b 100644 --- a/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java +++ b/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java @@ -2,17 +2,31 @@ import com.ibm.icu.dev.tool.UOption; import com.ibm.icu.util.VersionInfo; -import java.io.*; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStreamWriter; import java.net.URI; import java.net.URISyntaxException; import java.nio.charset.StandardCharsets; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.unicode.props.PropertyParsingInfo; import org.unicode.props.UcdProperty; import org.unicode.props.UcdPropertyValues.*; +/** + * Utility for generating fragments that describe the property values in a format that can be displayed in UAX42. + * UAX42 fragments live in unicodetools/src/main/resources/org/unicode/uax42/fragments + */ public class GeneratePropertyValues { private enum VALUESOUTPUTTYPE { @@ -669,7 +683,7 @@ private static String getFormattedTR38Syntax(UcdProperty ucdProperty) { // TODO: We should determine whether we still want to show empty values in the XML files. 
// TODO: See org.unicode.xml.UcdPropertyDetail.isCJKShowIfEmpty() boolean isShowIfEmpty = false; - for (UcdPropertyDetail propDetail : UcdPropertyDetail.cjkValues()) { + for (UCDPropertyDetail propDetail : UCDPropertyDetail.cjkValues()) { if (propDetail.getUcdProperty().equals(ucdProperty)) { isShowIfEmpty = propDetail.isCJKShowIfEmpty(); } diff --git a/unicodetools/src/main/java/org/unicode/xml/UCDDataResolver.java b/unicodetools/src/main/java/org/unicode/xml/UCDDataResolver.java index a30067bbb6..d30693e838 100644 --- a/unicodetools/src/main/java/org/unicode/xml/UCDDataResolver.java +++ b/unicodetools/src/main/java/org/unicode/xml/UCDDataResolver.java @@ -1,7 +1,6 @@ package org.unicode.xml; import com.ibm.icu.util.VersionInfo; -import java.util.*; import org.unicode.cldr.draft.FileUtilities; import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.PropertyParsingInfo; @@ -9,6 +8,14 @@ import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; + +/** + * Helper class for building sections of UCDXML files based on IndexUnicodeProperties values. 
+ */ public class UCDDataResolver { private final IndexUnicodeProperties indexUnicodeProperties; @@ -21,20 +28,20 @@ public UCDDataResolver(IndexUnicodeProperties iup, String namespace, UCDXMLWrite this.writer = writer; } - public void buildSection(UcdSectionDetail.UcdSection ucdSection) throws SAXException { + public void buildSection(UCDSectionDetail.UcdSection ucdSection) throws SAXException { VersionInfo minVersion = ucdSection.getMinVersion(); VersionInfo maxVersion = ucdSection.getMaxVersion(); String tag = ucdSection.toString(); String childTag = ucdSection.getChildTag(); boolean parserWithRange = ucdSection.getParserWithRange(); boolean parserWithMissing = ucdSection.getParserWithMissing(); - UcdSectionComponent[] ucdSectionComponents = + UCDSectionComponent[] ucdSectionComponents = ucdSection.getUcdSectionDetail().getUcdSectionComponents(); if (isCompatibleVersion(minVersion, maxVersion)) { writer.startElement(tag); { - for (UcdSectionComponent ucdSectionComponent : ucdSectionComponents) { + for (UCDSectionComponent ucdSectionComponent : ucdSectionComponents) { if (isCompatibleVersion( ucdSectionComponent.getMinVersion(), ucdSectionComponent.getMaxVersion())) { @@ -115,7 +122,7 @@ public void buildSection(UcdSectionDetail.UcdSection ucdSection) throws SAXExcep } private AttributesImpl getAttributes( - UcdSectionDetail.UcdSection ucdSection, String namespace, UcdLineParser.UcdLine line) { + UCDSectionDetail.UcdSection ucdSection, String namespace, UcdLineParser.UcdLine line) { switch (ucdSection) { case CJKRADICALS: return getCJKRadicalAttributes(namespace, line); diff --git a/unicodetools/src/main/java/org/unicode/xml/UcdPropertyDetail.java b/unicodetools/src/main/java/org/unicode/xml/UCDPropertyDetail.java similarity index 70% rename from unicodetools/src/main/java/org/unicode/xml/UcdPropertyDetail.java rename to unicodetools/src/main/java/org/unicode/xml/UCDPropertyDetail.java index a97ef5bab9..39192fd36b 100644 --- 
a/unicodetools/src/main/java/org/unicode/xml/UcdPropertyDetail.java +++ b/unicodetools/src/main/java/org/unicode/xml/UCDPropertyDetail.java @@ -5,22 +5,26 @@ import java.util.Set; import org.unicode.props.UcdProperty; -public class UcdPropertyDetail { +/** + * Helper class for determining how and when UCD properties should be shown in UCDXML. Also includes information + * about when a UCDProperty was added to Unicode. + */ +public class UCDPropertyDetail { - private static LinkedHashSet<UcdPropertyDetail> basePropertyDetails = - new LinkedHashSet<UcdPropertyDetail>(); - private static LinkedHashSet<UcdPropertyDetail> cjkPropertyDetails = - new LinkedHashSet<UcdPropertyDetail>(); - private static LinkedHashSet<UcdPropertyDetail> ucdxmlPropertyDetails = - new LinkedHashSet<UcdPropertyDetail>(); - private static LinkedHashSet<UcdPropertyDetail> allPropertyDetails = - new LinkedHashSet<UcdPropertyDetail>(); + private static LinkedHashSet<UCDPropertyDetail> basePropertyDetails = + new LinkedHashSet<UCDPropertyDetail>(); + private static LinkedHashSet<UCDPropertyDetail> cjkPropertyDetails = + new LinkedHashSet<UCDPropertyDetail>(); + private static LinkedHashSet<UCDPropertyDetail> ucdxmlPropertyDetails = + new LinkedHashSet<UCDPropertyDetail>(); + private static LinkedHashSet<UCDPropertyDetail> allPropertyDetails = + new LinkedHashSet<UCDPropertyDetail>(); - public static UcdPropertyDetail Age_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Age_Detail = + new UCDPropertyDetail( UcdProperty.Age, VersionInfo.getInstance(3, 2, 0), 1, true, false, false, true); - public static UcdPropertyDetail Name_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Name_Detail = + new UCDPropertyDetail( UcdProperty.Name, VersionInfo.getInstance(1, 1, 0), 2, @@ -28,8 +32,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Jamo_Short_Name_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Jamo_Short_Name_Detail = + new UCDPropertyDetail( UcdProperty.Jamo_Short_Name, VersionInfo.getInstance(5, 1, 0), 3, @@ -37,8 +41,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail General_Category_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail
General_Category_Detail = + new UCDPropertyDetail( UcdProperty.General_Category, VersionInfo.getInstance(1, 1, 0), 4, @@ -46,8 +50,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Canonical_Combining_Class_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Canonical_Combining_Class_Detail = + new UCDPropertyDetail( UcdProperty.Canonical_Combining_Class, VersionInfo.getInstance(1, 1, 0), 5, @@ -55,8 +59,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Decomposition_Type_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Decomposition_Type_Detail = + new UCDPropertyDetail( UcdProperty.Decomposition_Type, VersionInfo.getInstance(1, 1, 0), 6, @@ -64,8 +68,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Decomposition_Mapping_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Decomposition_Mapping_Detail = + new UCDPropertyDetail( UcdProperty.Decomposition_Mapping, VersionInfo.getInstance(1, 1, 0), 7, @@ -73,8 +77,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Numeric_Type_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Numeric_Type_Detail = + new UCDPropertyDetail( UcdProperty.Numeric_Type, VersionInfo.getInstance(1, 1, 0), 8, @@ -82,8 +86,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Numeric_Value_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Numeric_Value_Detail = + new UCDPropertyDetail( UcdProperty.Numeric_Value, VersionInfo.getInstance(1, 1, 0), 9, @@ -91,8 +95,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Bidi_Class_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Bidi_Class_Detail = + new UCDPropertyDetail( UcdProperty.Bidi_Class, VersionInfo.getInstance(1, 1, 0), 10, @@ -100,8 +104,8 
@@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Bidi_Paired_Bracket_Type_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Bidi_Paired_Bracket_Type_Detail = + new UCDPropertyDetail( UcdProperty.Bidi_Paired_Bracket_Type, VersionInfo.getInstance(6, 3, 0), 11, @@ -109,8 +113,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Bidi_Paired_Bracket_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Bidi_Paired_Bracket_Detail = + new UCDPropertyDetail( UcdProperty.Bidi_Paired_Bracket, VersionInfo.getInstance(6, 3, 0), 12, @@ -118,8 +122,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Bidi_Mirrored_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Bidi_Mirrored_Detail = + new UCDPropertyDetail( UcdProperty.Bidi_Mirrored, VersionInfo.getInstance(1, 1, 0), 13, @@ -127,8 +131,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Bidi_Mirroring_Glyph_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Bidi_Mirroring_Glyph_Detail = + new UCDPropertyDetail( UcdProperty.Bidi_Mirroring_Glyph, VersionInfo.getInstance(3, 0, 1), 14, @@ -136,8 +140,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Simple_Uppercase_Mapping_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Simple_Uppercase_Mapping_Detail = + new UCDPropertyDetail( UcdProperty.Simple_Uppercase_Mapping, VersionInfo.getInstance(1, 1, 0), 15, @@ -145,8 +149,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Simple_Lowercase_Mapping_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Simple_Lowercase_Mapping_Detail = + new UCDPropertyDetail( UcdProperty.Simple_Lowercase_Mapping, VersionInfo.getInstance(1, 1, 0), 16, @@ -154,8 +158,8 @@ public class UcdPropertyDetail { false, 
false, true); - public static UcdPropertyDetail Simple_Titlecase_Mapping_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Simple_Titlecase_Mapping_Detail = + new UCDPropertyDetail( UcdProperty.Simple_Titlecase_Mapping, VersionInfo.getInstance(1, 1, 0), 17, @@ -163,8 +167,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Uppercase_Mapping_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Uppercase_Mapping_Detail = + new UCDPropertyDetail( UcdProperty.Uppercase_Mapping, VersionInfo.getInstance(2, 1, 8), 18, @@ -172,8 +176,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Lowercase_Mapping_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Lowercase_Mapping_Detail = + new UCDPropertyDetail( UcdProperty.Lowercase_Mapping, VersionInfo.getInstance(2, 1, 8), 19, @@ -181,8 +185,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Titlecase_Mapping_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Titlecase_Mapping_Detail = + new UCDPropertyDetail( UcdProperty.Titlecase_Mapping, VersionInfo.getInstance(2, 1, 8), 20, @@ -194,8 +198,8 @@ public class UcdPropertyDetail { // ( // UcdProperty.Special_Case_Condition, VersionInfo.getInstance(1,1,0), 21, // true, false, false, true); - public static UcdPropertyDetail Simple_Case_Folding_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Simple_Case_Folding_Detail = + new UCDPropertyDetail( UcdProperty.Simple_Case_Folding, VersionInfo.getInstance(3, 0, 1), 22, @@ -203,8 +207,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Case_Folding_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Case_Folding_Detail = + new UCDPropertyDetail( UcdProperty.Case_Folding, VersionInfo.getInstance(3, 0, 1), 23, @@ -212,8 +216,8 @@ public class UcdPropertyDetail { false, false, 
true); - public static UcdPropertyDetail Joining_Type_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Joining_Type_Detail = + new UCDPropertyDetail( UcdProperty.Joining_Type, VersionInfo.getInstance(2, 0, 0), 24, @@ -221,8 +225,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Joining_Group_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Joining_Group_Detail = + new UCDPropertyDetail( UcdProperty.Joining_Group, VersionInfo.getInstance(2, 0, 0), 25, @@ -230,8 +234,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail East_Asian_Width_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail East_Asian_Width_Detail = + new UCDPropertyDetail( UcdProperty.East_Asian_Width, VersionInfo.getInstance(3, 0, 0), 26, @@ -239,8 +243,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Line_Break_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Line_Break_Detail = + new UCDPropertyDetail( UcdProperty.Line_Break, VersionInfo.getInstance(3, 0, 0), 27, @@ -248,8 +252,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Script_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Script_Detail = + new UCDPropertyDetail( UcdProperty.Script, VersionInfo.getInstance(3, 1, 0), 28, @@ -257,8 +261,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Script_Extensions_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Script_Extensions_Detail = + new UCDPropertyDetail( UcdProperty.Script_Extensions, VersionInfo.getInstance(6, 1, 0), 29, @@ -266,8 +270,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Dash_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Dash_Detail = + new UCDPropertyDetail( UcdProperty.Dash, 
VersionInfo.getInstance(2, 0, 0), 30, @@ -275,8 +279,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail White_Space_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail White_Space_Detail = + new UCDPropertyDetail( UcdProperty.White_Space, VersionInfo.getInstance(2, 0, 0), 31, @@ -284,8 +288,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Hyphen_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Hyphen_Detail = + new UCDPropertyDetail( UcdProperty.Hyphen, VersionInfo.getInstance(2, 0, 0), 32, @@ -293,8 +297,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Quotation_Mark_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Quotation_Mark_Detail = + new UCDPropertyDetail( UcdProperty.Quotation_Mark, VersionInfo.getInstance(2, 0, 0), 33, @@ -302,8 +306,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Radical_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Radical_Detail = + new UCDPropertyDetail( UcdProperty.Radical, VersionInfo.getInstance(3, 2, 0), 34, @@ -311,8 +315,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Ideographic_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Ideographic_Detail = + new UCDPropertyDetail( UcdProperty.Ideographic, VersionInfo.getInstance(2, 0, 0), 35, @@ -320,8 +324,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Unified_Ideograph_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Unified_Ideograph_Detail = + new UCDPropertyDetail( UcdProperty.Unified_Ideograph, VersionInfo.getInstance(3, 2, 0), 36, @@ -329,8 +333,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail IDS_Binary_Operator_Detail = - new UcdPropertyDetail( + public static 
UCDPropertyDetail IDS_Binary_Operator_Detail = + new UCDPropertyDetail( UcdProperty.IDS_Binary_Operator, VersionInfo.getInstance(3, 2, 0), 37, @@ -338,8 +342,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail IDS_Trinary_Operator_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail IDS_Trinary_Operator_Detail = + new UCDPropertyDetail( UcdProperty.IDS_Trinary_Operator, VersionInfo.getInstance(3, 2, 0), 38, @@ -347,8 +351,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Hangul_Syllable_Type_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Hangul_Syllable_Type_Detail = + new UCDPropertyDetail( UcdProperty.Hangul_Syllable_Type, VersionInfo.getInstance(4, 0, 0), 39, @@ -356,8 +360,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Default_Ignorable_Code_Point_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Default_Ignorable_Code_Point_Detail = + new UCDPropertyDetail( UcdProperty.Default_Ignorable_Code_Point, VersionInfo.getInstance(3, 2, 0), 40, @@ -365,8 +369,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Other_Default_Ignorable_Code_Point_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Other_Default_Ignorable_Code_Point_Detail = + new UCDPropertyDetail( UcdProperty.Other_Default_Ignorable_Code_Point, VersionInfo.getInstance(3, 2, 0), 41, @@ -374,8 +378,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Alphabetic_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Alphabetic_Detail = + new UCDPropertyDetail( UcdProperty.Alphabetic, VersionInfo.getInstance(1, 1, 0), 42, @@ -383,8 +387,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Other_Alphabetic_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail 
Other_Alphabetic_Detail = + new UCDPropertyDetail( UcdProperty.Other_Alphabetic, VersionInfo.getInstance(3, 1, 0), 43, @@ -392,8 +396,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Uppercase_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Uppercase_Detail = + new UCDPropertyDetail( UcdProperty.Uppercase, VersionInfo.getInstance(3, 1, 0), 44, @@ -401,8 +405,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Other_Uppercase_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Other_Uppercase_Detail = + new UCDPropertyDetail( UcdProperty.Other_Uppercase, VersionInfo.getInstance(3, 1, 0), 45, @@ -410,8 +414,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Lowercase_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Lowercase_Detail = + new UCDPropertyDetail( UcdProperty.Lowercase, VersionInfo.getInstance(3, 1, 0), 46, @@ -419,8 +423,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Other_Lowercase_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Other_Lowercase_Detail = + new UCDPropertyDetail( UcdProperty.Other_Lowercase, VersionInfo.getInstance(3, 1, 0), 47, @@ -428,8 +432,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Math_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Math_Detail = + new UCDPropertyDetail( UcdProperty.Math, VersionInfo.getInstance(2, 0, 0), 48, @@ -437,8 +441,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Other_Math_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Other_Math_Detail = + new UCDPropertyDetail( UcdProperty.Other_Math, VersionInfo.getInstance(3, 1, 0), 49, @@ -446,8 +450,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail 
Hex_Digit_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Hex_Digit_Detail = + new UCDPropertyDetail( UcdProperty.Hex_Digit, VersionInfo.getInstance(2, 0, 0), 50, @@ -455,8 +459,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail ASCII_Hex_Digit_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail ASCII_Hex_Digit_Detail = + new UCDPropertyDetail( UcdProperty.ASCII_Hex_Digit, VersionInfo.getInstance(3, 1, 1), 51, @@ -464,8 +468,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Noncharacter_Code_Point_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Noncharacter_Code_Point_Detail = + new UCDPropertyDetail( UcdProperty.Noncharacter_Code_Point, VersionInfo.getInstance(3, 0, 1), 52, @@ -473,8 +477,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Variation_Selector_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Variation_Selector_Detail = + new UCDPropertyDetail( UcdProperty.Variation_Selector, VersionInfo.getInstance(4, 0, 1), 53, @@ -482,8 +486,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Bidi_Control_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Bidi_Control_Detail = + new UCDPropertyDetail( UcdProperty.Bidi_Control, VersionInfo.getInstance(2, 0, 0), 54, @@ -491,8 +495,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Join_Control_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Join_Control_Detail = + new UCDPropertyDetail( UcdProperty.Join_Control, VersionInfo.getInstance(2, 0, 0), 55, @@ -500,8 +504,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Grapheme_Base_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Grapheme_Base_Detail = + new UCDPropertyDetail( 
UcdProperty.Grapheme_Base, VersionInfo.getInstance(3, 2, 0), 56, @@ -509,8 +513,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Grapheme_Extend_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Grapheme_Extend_Detail = + new UCDPropertyDetail( UcdProperty.Grapheme_Extend, VersionInfo.getInstance(3, 2, 0), 57, @@ -518,8 +522,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Other_Grapheme_Extend_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Other_Grapheme_Extend_Detail = + new UCDPropertyDetail( UcdProperty.Other_Grapheme_Extend, VersionInfo.getInstance(3, 2, 0), 58, @@ -527,8 +531,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Grapheme_Link_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Grapheme_Link_Detail = + new UCDPropertyDetail( UcdProperty.Grapheme_Link, VersionInfo.getInstance(3, 2, 0), 59, @@ -536,8 +540,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Sentence_Terminal_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Sentence_Terminal_Detail = + new UCDPropertyDetail( UcdProperty.Sentence_Terminal, VersionInfo.getInstance(9, 0, 0), 60, @@ -545,8 +549,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Extender_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Extender_Detail = + new UCDPropertyDetail( UcdProperty.Extender, VersionInfo.getInstance(2, 0, 0), 61, @@ -554,8 +558,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Terminal_Punctuation_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Terminal_Punctuation_Detail = + new UCDPropertyDetail( UcdProperty.Terminal_Punctuation, VersionInfo.getInstance(2, 0, 0), 62, @@ -563,8 +567,8 @@ public class UcdPropertyDetail { false, false, 
true); - public static UcdPropertyDetail Diacritic_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Diacritic_Detail = + new UCDPropertyDetail( UcdProperty.Diacritic, VersionInfo.getInstance(2, 0, 0), 63, @@ -572,8 +576,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Deprecated_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Deprecated_Detail = + new UCDPropertyDetail( UcdProperty.Deprecated, VersionInfo.getInstance(3, 2, 0), 64, @@ -581,8 +585,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail ID_Start_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail ID_Start_Detail = + new UCDPropertyDetail( UcdProperty.ID_Start, VersionInfo.getInstance(3, 1, 0), 65, @@ -590,8 +594,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Other_ID_Start_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Other_ID_Start_Detail = + new UCDPropertyDetail( UcdProperty.Other_ID_Start, VersionInfo.getInstance(4, 0, 0), 66, @@ -599,8 +603,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail XID_Start_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail XID_Start_Detail = + new UCDPropertyDetail( UcdProperty.XID_Start, VersionInfo.getInstance(3, 1, 0), 67, @@ -608,8 +612,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail ID_Continue_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail ID_Continue_Detail = + new UCDPropertyDetail( UcdProperty.ID_Continue, VersionInfo.getInstance(3, 1, 0), 68, @@ -617,8 +621,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Other_ID_Continue_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Other_ID_Continue_Detail = + new UCDPropertyDetail( UcdProperty.Other_ID_Continue, 
VersionInfo.getInstance(4, 1, 0), 69, @@ -626,8 +630,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail XID_Continue_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail XID_Continue_Detail = + new UCDPropertyDetail( UcdProperty.XID_Continue, VersionInfo.getInstance(3, 1, 0), 70, @@ -635,8 +639,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Soft_Dotted_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Soft_Dotted_Detail = + new UCDPropertyDetail( UcdProperty.Soft_Dotted, VersionInfo.getInstance(3, 2, 0), 71, @@ -644,8 +648,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Logical_Order_Exception_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Logical_Order_Exception_Detail = + new UCDPropertyDetail( UcdProperty.Logical_Order_Exception, VersionInfo.getInstance(3, 2, 0), 72, @@ -653,8 +657,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Pattern_White_Space_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Pattern_White_Space_Detail = + new UCDPropertyDetail( UcdProperty.Pattern_White_Space, VersionInfo.getInstance(4, 1, 0), 73, @@ -662,8 +666,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Pattern_Syntax_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Pattern_Syntax_Detail = + new UCDPropertyDetail( UcdProperty.Pattern_Syntax, VersionInfo.getInstance(4, 1, 0), 74, @@ -671,8 +675,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Grapheme_Cluster_Break_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Grapheme_Cluster_Break_Detail = + new UCDPropertyDetail( UcdProperty.Grapheme_Cluster_Break, VersionInfo.getInstance(4, 1, 0), 75, @@ -680,8 +684,8 @@ public class UcdPropertyDetail { false, false, true); 
- public static UcdPropertyDetail Word_Break_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Word_Break_Detail = + new UCDPropertyDetail( UcdProperty.Word_Break, VersionInfo.getInstance(4, 1, 0), 76, @@ -689,8 +693,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Sentence_Break_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Sentence_Break_Detail = + new UCDPropertyDetail( UcdProperty.Sentence_Break, VersionInfo.getInstance(4, 1, 0), 77, @@ -698,8 +702,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Composition_Exclusion_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Composition_Exclusion_Detail = + new UCDPropertyDetail( UcdProperty.Composition_Exclusion, VersionInfo.getInstance(3, 0, 0), 78, @@ -707,8 +711,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Full_Composition_Exclusion_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Full_Composition_Exclusion_Detail = + new UCDPropertyDetail( UcdProperty.Full_Composition_Exclusion, VersionInfo.getInstance(3, 1, 0), 79, @@ -716,8 +720,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail NFC_Quick_Check_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail NFC_Quick_Check_Detail = + new UCDPropertyDetail( UcdProperty.NFC_Quick_Check, VersionInfo.getInstance(3, 2, 0), 80, @@ -725,8 +729,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail NFD_Quick_Check_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail NFD_Quick_Check_Detail = + new UCDPropertyDetail( UcdProperty.NFD_Quick_Check, VersionInfo.getInstance(3, 2, 0), 81, @@ -734,8 +738,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail NFKC_Quick_Check_Detail = - new UcdPropertyDetail( + public static 
UCDPropertyDetail NFKC_Quick_Check_Detail = + new UCDPropertyDetail( UcdProperty.NFKC_Quick_Check, VersionInfo.getInstance(5, 2, 0), 82, @@ -743,8 +747,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail NFKD_Quick_Check_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail NFKD_Quick_Check_Detail = + new UCDPropertyDetail( UcdProperty.NFKD_Quick_Check, VersionInfo.getInstance(3, 2, 0), 83, @@ -752,8 +756,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Expands_On_NFC_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Expands_On_NFC_Detail = + new UCDPropertyDetail( UcdProperty.Expands_On_NFC, VersionInfo.getInstance(3, 2, 0), 84, @@ -761,8 +765,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Expands_On_NFD_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Expands_On_NFD_Detail = + new UCDPropertyDetail( UcdProperty.Expands_On_NFD, VersionInfo.getInstance(3, 2, 0), 85, @@ -770,8 +774,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Expands_On_NFKC_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Expands_On_NFKC_Detail = + new UCDPropertyDetail( UcdProperty.Expands_On_NFKC, VersionInfo.getInstance(3, 2, 0), 86, @@ -779,8 +783,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Expands_On_NFKD_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Expands_On_NFKD_Detail = + new UCDPropertyDetail( UcdProperty.Expands_On_NFKD, VersionInfo.getInstance(3, 2, 0), 87, @@ -788,8 +792,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail FC_NFC_Closure_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail FC_NFC_Closure_Detail = + new UCDPropertyDetail( UcdProperty.FC_NFKC_Closure, VersionInfo.getInstance(3, 1, 0), 88, @@ -797,8 
+801,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Case_Ignorable_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Case_Ignorable_Detail = + new UCDPropertyDetail( UcdProperty.Case_Ignorable, VersionInfo.getInstance(5, 2, 0), 89, @@ -806,8 +810,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Cased_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Cased_Detail = + new UCDPropertyDetail( UcdProperty.Cased, VersionInfo.getInstance(5, 2, 0), 90, @@ -815,8 +819,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Changes_When_CaseFolded_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Changes_When_CaseFolded_Detail = + new UCDPropertyDetail( UcdProperty.Changes_When_Casefolded, VersionInfo.getInstance(5, 2, 0), 91, @@ -824,8 +828,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Changes_When_CaseMapped_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Changes_When_CaseMapped_Detail = + new UCDPropertyDetail( UcdProperty.Changes_When_Casemapped, VersionInfo.getInstance(5, 2, 0), 92, @@ -833,8 +837,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Changes_When_NFKC_Casefolded_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Changes_When_NFKC_Casefolded_Detail = + new UCDPropertyDetail( UcdProperty.Changes_When_NFKC_Casefolded, VersionInfo.getInstance(5, 2, 0), 93, @@ -842,8 +846,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Changes_When_Lowercased_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Changes_When_Lowercased_Detail = + new UCDPropertyDetail( UcdProperty.Changes_When_Lowercased, VersionInfo.getInstance(5, 2, 0), 94, @@ -851,8 +855,8 @@ public class UcdPropertyDetail { false, false, true); - 
public static UcdPropertyDetail Changes_When_Titlecased_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Changes_When_Titlecased_Detail = + new UCDPropertyDetail( UcdProperty.Changes_When_Titlecased, VersionInfo.getInstance(5, 2, 0), 95, @@ -860,8 +864,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Changes_When_Uppercased_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Changes_When_Uppercased_Detail = + new UCDPropertyDetail( UcdProperty.Changes_When_Uppercased, VersionInfo.getInstance(5, 2, 0), 96, @@ -869,8 +873,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail NFKC_Casefold_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail NFKC_Casefold_Detail = + new UCDPropertyDetail( UcdProperty.NFKC_Casefold, VersionInfo.getInstance(5, 2, 0), 97, @@ -878,8 +882,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Indic_Syllabic_Category_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Indic_Syllabic_Category_Detail = + new UCDPropertyDetail( UcdProperty.Indic_Syllabic_Category, VersionInfo.getInstance(6, 1, 0), 98, @@ -891,8 +895,8 @@ public class UcdPropertyDetail { // UcdProperty.Indic_Matra_Category, VersionInfo.getInstance(6,1,0), // VersionInfo.getInstance(7,0,0), 99, // true, false, false, true); - public static UcdPropertyDetail Indic_Positional_Category_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Indic_Positional_Category_Detail = + new UCDPropertyDetail( UcdProperty.Indic_Positional_Category, VersionInfo.getInstance(8, 0, 0), 100, @@ -900,8 +904,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail kJa_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kJa_Detail = + new UCDPropertyDetail( UcdProperty.kJa, VersionInfo.getInstance(8, 0, 0), 101, @@ -909,8 +913,8 @@ public class 
UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail Prepended_Concatenation_Mark_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Prepended_Concatenation_Mark_Detail = + new UCDPropertyDetail( UcdProperty.Prepended_Concatenation_Mark, VersionInfo.getInstance(9, 0, 0), 102, @@ -918,8 +922,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Vertical_Orientation_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Vertical_Orientation_Detail = + new UCDPropertyDetail( UcdProperty.Vertical_Orientation, VersionInfo.getInstance(10, 0, 0), 103, @@ -927,8 +931,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Regional_Indicator_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Regional_Indicator_Detail = + new UCDPropertyDetail( UcdProperty.Regional_Indicator, VersionInfo.getInstance(10, 0, 0), 104, @@ -936,8 +940,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Block_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Block_Detail = + new UCDPropertyDetail( UcdProperty.Block, VersionInfo.getInstance(2, 0, 0), 105, @@ -945,8 +949,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Equivalent_Unified_Ideograph_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Equivalent_Unified_Ideograph_Detail = + new UCDPropertyDetail( UcdProperty.Equivalent_Unified_Ideograph, VersionInfo.getInstance(11, 0, 0), 106, @@ -954,8 +958,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kCompatibilityVariant_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kCompatibilityVariant_Detail = + new UCDPropertyDetail( UcdProperty.kCompatibilityVariant, VersionInfo.getInstance(3, 2, 0), 107, @@ -963,8 +967,8 @@ public class UcdPropertyDetail { true, true, true); - public 
static UcdPropertyDetail kRSUnicode_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kRSUnicode_Detail = + new UCDPropertyDetail( UcdProperty.kRSUnicode, VersionInfo.getInstance(2, 0, 0), 108, @@ -975,8 +979,8 @@ public class UcdPropertyDetail { // public static UcdPropertyDetail kIRG_RSIndex_Detail = new UcdPropertyDetail ( // UcdProperty.kIRG_RSIndex, VersionInfo.getInstance(11,0,0), 109, // false, true, false, true); - public static UcdPropertyDetail kIRG_GSource_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRG_GSource_Detail = + new UCDPropertyDetail( UcdProperty.kIRG_GSource, VersionInfo.getInstance(3, 0, 0), 110, @@ -984,8 +988,8 @@ public class UcdPropertyDetail { true, true, true); - public static UcdPropertyDetail kIRG_TSource_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRG_TSource_Detail = + new UCDPropertyDetail( UcdProperty.kIRG_TSource, VersionInfo.getInstance(3, 0, 0), 111, @@ -993,8 +997,8 @@ public class UcdPropertyDetail { true, true, true); - public static UcdPropertyDetail kIRG_JSource_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRG_JSource_Detail = + new UCDPropertyDetail( UcdProperty.kIRG_JSource, VersionInfo.getInstance(3, 0, 0), 112, @@ -1002,8 +1006,8 @@ public class UcdPropertyDetail { true, true, true); - public static UcdPropertyDetail kIRG_KSource_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRG_KSource_Detail = + new UCDPropertyDetail( UcdProperty.kIRG_KSource, VersionInfo.getInstance(3, 0, 0), 113, @@ -1011,8 +1015,8 @@ public class UcdPropertyDetail { true, true, true); - public static UcdPropertyDetail kIRG_KPSource_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRG_KPSource_Detail = + new UCDPropertyDetail( UcdProperty.kIRG_KPSource, VersionInfo.getInstance(3, 1, 1), 114, @@ -1020,8 +1024,8 @@ public class UcdPropertyDetail { true, true, true); - public static UcdPropertyDetail 
kIRG_VSource_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRG_VSource_Detail = + new UCDPropertyDetail( UcdProperty.kIRG_VSource, VersionInfo.getInstance(3, 0, 0), 115, @@ -1029,8 +1033,8 @@ public class UcdPropertyDetail { true, true, true); - public static UcdPropertyDetail kIRG_HSource_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRG_HSource_Detail = + new UCDPropertyDetail( UcdProperty.kIRG_HSource, VersionInfo.getInstance(3, 1, 0), 116, @@ -1038,8 +1042,8 @@ public class UcdPropertyDetail { true, true, true); - public static UcdPropertyDetail kIRG_USource_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRG_USource_Detail = + new UCDPropertyDetail( UcdProperty.kIRG_USource, VersionInfo.getInstance(4, 0, 1), 117, @@ -1047,8 +1051,8 @@ public class UcdPropertyDetail { true, true, true); - public static UcdPropertyDetail kIRG_MSource_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRG_MSource_Detail = + new UCDPropertyDetail( UcdProperty.kIRG_MSource, VersionInfo.getInstance(5, 2, 0), 118, @@ -1056,8 +1060,8 @@ public class UcdPropertyDetail { true, true, true); - public static UcdPropertyDetail kIRG_UKSource_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRG_UKSource_Detail = + new UCDPropertyDetail( UcdProperty.kIRG_UKSource, VersionInfo.getInstance(13, 0, 0), 119, @@ -1065,8 +1069,8 @@ public class UcdPropertyDetail { true, true, true); - public static UcdPropertyDetail kIRG_SSource_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRG_SSource_Detail = + new UCDPropertyDetail( UcdProperty.kIRG_SSource, VersionInfo.getInstance(13, 0, 0), 120, @@ -1074,8 +1078,8 @@ public class UcdPropertyDetail { true, true, true); - public static UcdPropertyDetail kIICore_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIICore_Detail = + new UCDPropertyDetail( UcdProperty.kIICore, VersionInfo.getInstance(4, 1, 0), 121, @@ -1083,8 
+1087,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kUnihanCore2020_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kUnihanCore2020_Detail = + new UCDPropertyDetail( UcdProperty.kUnihanCore2020, VersionInfo.getInstance(13, 0, 0), 122, @@ -1092,8 +1096,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kGB0_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kGB0_Detail = + new UCDPropertyDetail( UcdProperty.kGB0, VersionInfo.getInstance(2, 0, 0), 123, @@ -1101,8 +1105,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kGB1_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kGB1_Detail = + new UCDPropertyDetail( UcdProperty.kGB1, VersionInfo.getInstance(2, 0, 0), 124, @@ -1110,8 +1114,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kGB3_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kGB3_Detail = + new UCDPropertyDetail( UcdProperty.kGB3, VersionInfo.getInstance(2, 0, 0), 125, @@ -1119,8 +1123,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kGB5_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kGB5_Detail = + new UCDPropertyDetail( UcdProperty.kGB5, VersionInfo.getInstance(2, 0, 0), 126, @@ -1128,8 +1132,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kGB7_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kGB7_Detail = + new UCDPropertyDetail( UcdProperty.kGB7, VersionInfo.getInstance(2, 0, 0), 127, @@ -1137,8 +1141,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kGB8_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kGB8_Detail = + new UCDPropertyDetail( UcdProperty.kGB8, VersionInfo.getInstance(2, 0, 0), 128, @@ -1146,8 +1150,8 @@ public 
class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kCNS1986_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kCNS1986_Detail = + new UCDPropertyDetail( UcdProperty.kCNS1986, VersionInfo.getInstance(2, 0, 0), 129, @@ -1155,8 +1159,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kCNS1992_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kCNS1992_Detail = + new UCDPropertyDetail( UcdProperty.kCNS1992, VersionInfo.getInstance(2, 0, 0), 130, @@ -1164,8 +1168,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kJis0_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kJis0_Detail = + new UCDPropertyDetail( UcdProperty.kJis0, VersionInfo.getInstance(2, 0, 0), 131, @@ -1173,8 +1177,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kJis1_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kJis1_Detail = + new UCDPropertyDetail( UcdProperty.kJis1, VersionInfo.getInstance(2, 0, 0), 132, @@ -1182,8 +1186,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kJIS0213_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kJIS0213_Detail = + new UCDPropertyDetail( UcdProperty.kJIS0213, VersionInfo.getInstance(3, 1, 1), 133, @@ -1191,8 +1195,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kKSC0_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kKSC0_Detail = + new UCDPropertyDetail( UcdProperty.kKSC0, VersionInfo.getInstance(2, 0, 0), VersionInfo.getInstance(15, 1, 0), @@ -1201,8 +1205,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kKSC1_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kKSC1_Detail = + new UCDPropertyDetail( UcdProperty.kKSC1, VersionInfo.getInstance(2, 0, 0), 
VersionInfo.getInstance(15, 1, 0), @@ -1211,8 +1215,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kKPS0_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kKPS0_Detail = + new UCDPropertyDetail( UcdProperty.kKPS0, VersionInfo.getInstance(3, 1, 1), VersionInfo.getInstance(15, 1, 0), @@ -1221,8 +1225,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kKPS1_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kKPS1_Detail = + new UCDPropertyDetail( UcdProperty.kKPS1, VersionInfo.getInstance(3, 1, 1), VersionInfo.getInstance(15, 1, 0), @@ -1231,8 +1235,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kHKSCS_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kHKSCS_Detail = + new UCDPropertyDetail( UcdProperty.kHKSCS, VersionInfo.getInstance(3, 1, 1), VersionInfo.getInstance(15, 1, 0), @@ -1241,8 +1245,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kCantonese_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kCantonese_Detail = + new UCDPropertyDetail( UcdProperty.kCantonese, VersionInfo.getInstance(2, 0, 0), 139, @@ -1250,8 +1254,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kHangul_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kHangul_Detail = + new UCDPropertyDetail( UcdProperty.kHangul, VersionInfo.getInstance(5, 0, 0), 140, @@ -1259,8 +1263,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kDefinition_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kDefinition_Detail = + new UCDPropertyDetail( UcdProperty.kDefinition, VersionInfo.getInstance(2, 0, 0), 141, @@ -1268,8 +1272,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kHanYu_Detail = - new 
UcdPropertyDetail( + public static UCDPropertyDetail kHanYu_Detail = + new UCDPropertyDetail( UcdProperty.kHanYu, VersionInfo.getInstance(2, 0, 0), 142, @@ -1281,8 +1285,8 @@ public class UcdPropertyDetail { // UcdProperty.kAlternateHanYu, VersionInfo.getInstance(2,0,0), // VersionInfo.getInstance(3,1,1), 143, // false, true, false, true); - public static UcdPropertyDetail kMandarin_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kMandarin_Detail = + new UCDPropertyDetail( UcdProperty.kMandarin, VersionInfo.getInstance(2, 0, 0), 144, @@ -1290,8 +1294,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kCihaiT_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kCihaiT_Detail = + new UCDPropertyDetail( UcdProperty.kCihaiT, VersionInfo.getInstance(3, 2, 0), 145, @@ -1299,8 +1303,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kSBGY_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kSBGY_Detail = + new UCDPropertyDetail( UcdProperty.kSBGY, VersionInfo.getInstance(3, 2, 0), 146, @@ -1308,8 +1312,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kNelson_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kNelson_Detail = + new UCDPropertyDetail( UcdProperty.kNelson, VersionInfo.getInstance(2, 0, 0), 147, @@ -1317,8 +1321,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kCowles_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kCowles_Detail = + new UCDPropertyDetail( UcdProperty.kCowles, VersionInfo.getInstance(3, 1, 1), 148, @@ -1326,8 +1330,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kMatthews_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kMatthews_Detail = + new UCDPropertyDetail( UcdProperty.kMatthews, VersionInfo.getInstance(2, 0, 0), 149, @@ 
-1335,8 +1339,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kOtherNumeric_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kOtherNumeric_Detail = + new UCDPropertyDetail( UcdProperty.kOtherNumeric, VersionInfo.getInstance(3, 2, 0), 150, @@ -1344,8 +1348,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kPhonetic_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kPhonetic_Detail = + new UCDPropertyDetail( UcdProperty.kPhonetic, VersionInfo.getInstance(3, 1, 0), 151, @@ -1353,8 +1357,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kGSR_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kGSR_Detail = + new UCDPropertyDetail( UcdProperty.kGSR, VersionInfo.getInstance(4, 0, 1), 152, @@ -1362,8 +1366,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kFenn_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kFenn_Detail = + new UCDPropertyDetail( UcdProperty.kFenn, VersionInfo.getInstance(3, 1, 1), 153, @@ -1371,8 +1375,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kFennIndex_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kFennIndex_Detail = + new UCDPropertyDetail( UcdProperty.kFennIndex, VersionInfo.getInstance(4, 1, 0), 154, @@ -1380,8 +1384,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kKarlgren_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kKarlgren_Detail = + new UCDPropertyDetail( UcdProperty.kKarlgren, VersionInfo.getInstance(3, 1, 1), 155, @@ -1389,8 +1393,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kCangjie_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kCangjie_Detail = + new UCDPropertyDetail( UcdProperty.kCangjie, 
VersionInfo.getInstance(3, 1, 1), 156, @@ -1398,8 +1402,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kMeyerWempe_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kMeyerWempe_Detail = + new UCDPropertyDetail( UcdProperty.kMeyerWempe, VersionInfo.getInstance(3, 1, 0), 157, @@ -1407,8 +1411,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kSimplifiedVariant_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kSimplifiedVariant_Detail = + new UCDPropertyDetail( UcdProperty.kSimplifiedVariant, VersionInfo.getInstance(2, 0, 0), 158, @@ -1416,8 +1420,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kTraditionalVariant_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kTraditionalVariant_Detail = + new UCDPropertyDetail( UcdProperty.kTraditionalVariant, VersionInfo.getInstance(2, 0, 0), 159, @@ -1425,8 +1429,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kSpecializedSemanticVariant_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kSpecializedSemanticVariant_Detail = + new UCDPropertyDetail( UcdProperty.kSpecializedSemanticVariant, VersionInfo.getInstance(2, 0, 0), 160, @@ -1434,8 +1438,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kSemanticVariant_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kSemanticVariant_Detail = + new UCDPropertyDetail( UcdProperty.kSemanticVariant, VersionInfo.getInstance(2, 0, 0), 161, @@ -1443,8 +1447,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kVietnamese_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kVietnamese_Detail = + new UCDPropertyDetail( UcdProperty.kVietnamese, VersionInfo.getInstance(3, 1, 1), 162, @@ -1452,8 +1456,8 @@ public class UcdPropertyDetail { 
true, false, true); - public static UcdPropertyDetail kLau_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kLau_Detail = + new UCDPropertyDetail( UcdProperty.kLau, VersionInfo.getInstance(3, 1, 1), 163, @@ -1461,8 +1465,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kTang_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kTang_Detail = + new UCDPropertyDetail( UcdProperty.kTang, VersionInfo.getInstance(2, 0, 0), 164, @@ -1470,8 +1474,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kZVariant_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kZVariant_Detail = + new UCDPropertyDetail( UcdProperty.kZVariant, VersionInfo.getInstance(2, 0, 0), 165, @@ -1479,8 +1483,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kJapaneseKun_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kJapaneseKun_Detail = + new UCDPropertyDetail( UcdProperty.kJapaneseKun, VersionInfo.getInstance(2, 0, 0), 166, @@ -1488,8 +1492,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kJapaneseOn_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kJapaneseOn_Detail = + new UCDPropertyDetail( UcdProperty.kJapaneseOn, VersionInfo.getInstance(2, 0, 0), 167, @@ -1497,8 +1501,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kKangXi_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kKangXi_Detail = + new UCDPropertyDetail( UcdProperty.kKangXi, VersionInfo.getInstance(2, 0, 0), 168, @@ -1510,8 +1514,8 @@ public class UcdPropertyDetail { // UcdProperty.kAlternateKangXi, VersionInfo.getInstance(2,0,0), // VersionInfo.getInstance(4,0,1), 169, // false, true, false, true); - public static UcdPropertyDetail kBigFive_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail 
kBigFive_Detail = + new UCDPropertyDetail( UcdProperty.kBigFive, VersionInfo.getInstance(2, 0, 0), 170, @@ -1519,8 +1523,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kCCCII_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kCCCII_Detail = + new UCDPropertyDetail( UcdProperty.kCCCII, VersionInfo.getInstance(2, 0, 0), 171, @@ -1528,8 +1532,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kDaeJaweon_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kDaeJaweon_Detail = + new UCDPropertyDetail( UcdProperty.kDaeJaweon, VersionInfo.getInstance(2, 0, 0), 172, @@ -1537,8 +1541,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kEACC_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kEACC_Detail = + new UCDPropertyDetail( UcdProperty.kEACC, VersionInfo.getInstance(2, 0, 0), 173, @@ -1546,8 +1550,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kFrequency_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kFrequency_Detail = + new UCDPropertyDetail( UcdProperty.kFrequency, VersionInfo.getInstance(3, 2, 0), VersionInfo.getInstance(16, 0, 0), @@ -1556,8 +1560,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kGradeLevel_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kGradeLevel_Detail = + new UCDPropertyDetail( UcdProperty.kGradeLevel, VersionInfo.getInstance(3, 2, 0), 175, @@ -1565,8 +1569,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kHDZRadBreak_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kHDZRadBreak_Detail = + new UCDPropertyDetail( UcdProperty.kHDZRadBreak, VersionInfo.getInstance(4, 1, 0), 176, @@ -1574,8 +1578,8 @@ public class UcdPropertyDetail { true, false, true); - public static 
UcdPropertyDetail kHKGlyph_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kHKGlyph_Detail = + new UCDPropertyDetail( UcdProperty.kHKGlyph, VersionInfo.getInstance(3, 1, 1), 177, @@ -1583,8 +1587,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kHanyuPinlu_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kHanyuPinlu_Detail = + new UCDPropertyDetail( UcdProperty.kHanyuPinlu, VersionInfo.getInstance(4, 0, 1), 178, @@ -1592,8 +1596,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kHanyuPinyin_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kHanyuPinyin_Detail = + new UCDPropertyDetail( UcdProperty.kHanyuPinyin, VersionInfo.getInstance(5, 2, 0), 179, @@ -1601,8 +1605,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kIRGHanyuDaZidian_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRGHanyuDaZidian_Detail = + new UCDPropertyDetail( UcdProperty.kIRGHanyuDaZidian, VersionInfo.getInstance(3, 0, 0), 180, @@ -1610,8 +1614,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kIRGKangXi_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRGKangXi_Detail = + new UCDPropertyDetail( UcdProperty.kIRGKangXi, VersionInfo.getInstance(3, 0, 0), 181, @@ -1619,8 +1623,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kIRGDaeJaweon_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRGDaeJaweon_Detail = + new UCDPropertyDetail( UcdProperty.kIRGDaeJaweon, VersionInfo.getInstance(3, 0, 0), 182, @@ -1628,8 +1632,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kIRGDaiKanwaZiten_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIRGDaiKanwaZiten_Detail = + new UCDPropertyDetail( UcdProperty.kIRGDaiKanwaZiten, 
VersionInfo.getInstance(3, 0, 0), VersionInfo.getInstance(15, 1, 0), @@ -1638,8 +1642,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kKorean_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kKorean_Detail = + new UCDPropertyDetail( UcdProperty.kKorean, VersionInfo.getInstance(2, 0, 0), 184, @@ -1647,8 +1651,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kMainlandTelegraph_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kMainlandTelegraph_Detail = + new UCDPropertyDetail( UcdProperty.kMainlandTelegraph, VersionInfo.getInstance(2, 0, 0), 185, @@ -1656,8 +1660,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kMorohashi_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kMorohashi_Detail = + new UCDPropertyDetail( UcdProperty.kMorohashi, VersionInfo.getInstance(2, 0, 0), 186, @@ -1669,8 +1673,8 @@ public class UcdPropertyDetail { // UcdProperty.kAlternateMorohashi, VersionInfo.getInstance(2,0,0), // VersionInfo.getInstance(4,0,1), 187, // false, true, false, true); - public static UcdPropertyDetail kPrimaryNumeric_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kPrimaryNumeric_Detail = + new UCDPropertyDetail( UcdProperty.kPrimaryNumeric, VersionInfo.getInstance(3, 2, 0), 188, @@ -1678,8 +1682,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kTaiwanTelegraph_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kTaiwanTelegraph_Detail = + new UCDPropertyDetail( UcdProperty.kTaiwanTelegraph, VersionInfo.getInstance(2, 0, 0), 189, @@ -1687,8 +1691,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kXerox_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kXerox_Detail = + new UCDPropertyDetail( UcdProperty.kXerox, VersionInfo.getInstance(2, 0, 0), 190, 
@@ -1696,8 +1700,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kPseudoGB1_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kPseudoGB1_Detail = + new UCDPropertyDetail( UcdProperty.kPseudoGB1, VersionInfo.getInstance(2, 0, 0), 191, @@ -1705,8 +1709,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kIBMJapan_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kIBMJapan_Detail = + new UCDPropertyDetail( UcdProperty.kIBMJapan, VersionInfo.getInstance(2, 0, 0), 192, @@ -1714,8 +1718,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kAccountingNumeric_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kAccountingNumeric_Detail = + new UCDPropertyDetail( UcdProperty.kAccountingNumeric, VersionInfo.getInstance(3, 2, 0), 193, @@ -1723,8 +1727,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kCheungBauer_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kCheungBauer_Detail = + new UCDPropertyDetail( UcdProperty.kCheungBauer, VersionInfo.getInstance(5, 0, 0), 194, @@ -1732,8 +1736,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kCheungBauerIndex_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kCheungBauerIndex_Detail = + new UCDPropertyDetail( UcdProperty.kCheungBauerIndex, VersionInfo.getInstance(5, 0, 0), 195, @@ -1741,8 +1745,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kFourCornerCode_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kFourCornerCode_Detail = + new UCDPropertyDetail( UcdProperty.kFourCornerCode, VersionInfo.getInstance(5, 0, 0), 196, @@ -1753,8 +1757,8 @@ public class UcdPropertyDetail { // public static UcdPropertyDetail kWubi_Detail = new UcdPropertyDetail ( // UcdProperty.kWubi, 
VersionInfo.getInstance(11,0,0), 197, // false, true, false, true); - public static UcdPropertyDetail kXHC1983_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kXHC1983_Detail = + new UCDPropertyDetail( UcdProperty.kXHC1983, VersionInfo.getInstance(5, 1, 0), 198, @@ -1762,8 +1766,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kJinmeiyoKanji_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kJinmeiyoKanji_Detail = + new UCDPropertyDetail( UcdProperty.kJinmeiyoKanji, VersionInfo.getInstance(11, 0, 0), 199, @@ -1771,8 +1775,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kJoyoKanji_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kJoyoKanji_Detail = + new UCDPropertyDetail( UcdProperty.kJoyoKanji, VersionInfo.getInstance(11, 0, 0), 200, @@ -1780,8 +1784,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kKoreanEducationHanja_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kKoreanEducationHanja_Detail = + new UCDPropertyDetail( UcdProperty.kKoreanEducationHanja, VersionInfo.getInstance(11, 0, 0), 201, @@ -1789,8 +1793,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kKoreanName_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kKoreanName_Detail = + new UCDPropertyDetail( UcdProperty.kKoreanName, VersionInfo.getInstance(11, 0, 0), 202, @@ -1798,8 +1802,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kTGH_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kTGH_Detail = + new UCDPropertyDetail( UcdProperty.kTGH, VersionInfo.getInstance(11, 0, 0), 203, @@ -1807,8 +1811,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kTGHZ2013_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail 
kTGHZ2013_Detail = + new UCDPropertyDetail( UcdProperty.kTGHZ2013, VersionInfo.getInstance(13, 0, 0), 204, @@ -1816,8 +1820,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kSpoofingVariant_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kSpoofingVariant_Detail = + new UCDPropertyDetail( UcdProperty.kSpoofingVariant, VersionInfo.getInstance(13, 0, 0), 205, @@ -1825,8 +1829,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kRSKanWa_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kRSKanWa_Detail = + new UCDPropertyDetail( UcdProperty.kRSKanWa, VersionInfo.getInstance(2, 0, 0), 206, @@ -1834,8 +1838,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kRSJapanese_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kRSJapanese_Detail = + new UCDPropertyDetail( UcdProperty.kRSJapanese, VersionInfo.getInstance(2, 0, 0), 207, @@ -1843,8 +1847,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kRSKorean_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kRSKorean_Detail = + new UCDPropertyDetail( UcdProperty.kRSKorean, VersionInfo.getInstance(2, 0, 0), 208, @@ -1852,8 +1856,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kRSKangXi_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kRSKangXi_Detail = + new UCDPropertyDetail( UcdProperty.kRSKangXi, VersionInfo.getInstance(2, 0, 0), VersionInfo.getInstance(15, 1, 0), @@ -1862,8 +1866,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kRSAdobe_Japan1_6_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kRSAdobe_Japan1_6_Detail = + new UCDPropertyDetail( UcdProperty.kRSAdobe_Japan1_6, VersionInfo.getInstance(4, 1, 0), 210, @@ -1871,8 +1875,8 @@ public class UcdPropertyDetail { 
true, false, true); - public static UcdPropertyDetail kTotalStrokes_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kTotalStrokes_Detail = + new UCDPropertyDetail( UcdProperty.kTotalStrokes, VersionInfo.getInstance(3, 1, 0), 211, @@ -1880,8 +1884,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kRSTUnicode_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kRSTUnicode_Detail = + new UCDPropertyDetail( UcdProperty.kRSTUnicode, VersionInfo.getInstance(9, 0, 0), 212, @@ -1889,8 +1893,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kTGT_MergedSrc_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kTGT_MergedSrc_Detail = + new UCDPropertyDetail( UcdProperty.kTGT_MergedSrc, VersionInfo.getInstance(9, 0, 0), 213, @@ -1898,8 +1902,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kSrc_NushuDuben_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kSrc_NushuDuben_Detail = + new UCDPropertyDetail( UcdProperty.kSrc_NushuDuben, VersionInfo.getInstance(10, 0, 0), 214, @@ -1907,8 +1911,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kReading_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kReading_Detail = + new UCDPropertyDetail( UcdProperty.kReading, VersionInfo.getInstance(10, 0, 0), 215, @@ -1916,8 +1920,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail ISO_Comment_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail ISO_Comment_Detail = + new UCDPropertyDetail( UcdProperty.ISO_Comment, VersionInfo.getInstance(11, 0, 0), 216, @@ -1925,8 +1929,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Unicode_1_Name_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Unicode_1_Name_Detail = + new 
UCDPropertyDetail( UcdProperty.Unicode_1_Name, VersionInfo.getInstance(2, 0, 0), 217, @@ -1934,8 +1938,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Name_Alias_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Name_Alias_Detail = + new UCDPropertyDetail( UcdProperty.Name_Alias, VersionInfo.getInstance(5, 0, 0), 218, @@ -1943,8 +1947,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Emoji_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Emoji_Detail = + new UCDPropertyDetail( UcdProperty.Emoji, VersionInfo.getInstance(13, 0, 0), 219, @@ -1952,8 +1956,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Emoji_Presentation_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Emoji_Presentation_Detail = + new UCDPropertyDetail( UcdProperty.Emoji_Presentation, VersionInfo.getInstance(13, 0, 0), 220, @@ -1961,8 +1965,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Emoji_Modifier_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Emoji_Modifier_Detail = + new UCDPropertyDetail( UcdProperty.Emoji_Modifier, VersionInfo.getInstance(13, 0, 0), 221, @@ -1970,8 +1974,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Emoji_Modifier_Base_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Emoji_Modifier_Base_Detail = + new UCDPropertyDetail( UcdProperty.Emoji_Modifier_Base, VersionInfo.getInstance(13, 0, 0), 222, @@ -1979,8 +1983,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Emoji_Component_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Emoji_Component_Detail = + new UCDPropertyDetail( UcdProperty.Emoji_Component, VersionInfo.getInstance(13, 0, 0), 223, @@ -1988,8 +1992,8 @@ public class UcdPropertyDetail { false, 
false, true); - public static UcdPropertyDetail Extended_Pictographic_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Extended_Pictographic_Detail = + new UCDPropertyDetail( UcdProperty.Extended_Pictographic, VersionInfo.getInstance(13, 0, 0), 224, @@ -1997,8 +2001,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail kStrange_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kStrange_Detail = + new UCDPropertyDetail( UcdProperty.kStrange, VersionInfo.getInstance(14, 0, 0), 225, @@ -2006,8 +2010,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kAlternateTotalStrokes_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kAlternateTotalStrokes_Detail = + new UCDPropertyDetail( UcdProperty.kAlternateTotalStrokes, VersionInfo.getInstance(15, 0, 0), 226, @@ -2015,8 +2019,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail NFKC_Simple_Casefold_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail NFKC_Simple_Casefold_Detail = + new UCDPropertyDetail( UcdProperty.NFKC_Simple_Casefold, VersionInfo.getInstance(15, 1, 0), 227, @@ -2024,8 +2028,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail ID_Compat_Math_Start_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail ID_Compat_Math_Start_Detail = + new UCDPropertyDetail( UcdProperty.ID_Compat_Math_Start, VersionInfo.getInstance(15, 1, 0), 228, @@ -2033,8 +2037,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail ID_Compat_Math_Continue_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail ID_Compat_Math_Continue_Detail = + new UCDPropertyDetail( UcdProperty.ID_Compat_Math_Continue, VersionInfo.getInstance(15, 1, 0), 229, @@ -2042,8 +2046,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail 
IDS_Unary_Operator_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail IDS_Unary_Operator_Detail = + new UCDPropertyDetail( UcdProperty.IDS_Unary_Operator, VersionInfo.getInstance(15, 1, 0), 230, @@ -2051,8 +2055,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail kJapanese_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kJapanese_Detail = + new UCDPropertyDetail( UcdProperty.kJapanese, VersionInfo.getInstance(15, 1, 0), 231, @@ -2060,8 +2064,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kMojiJoho_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kMojiJoho_Detail = + new UCDPropertyDetail( UcdProperty.kMojiJoho, VersionInfo.getInstance(15, 1, 0), 232, @@ -2069,8 +2073,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kSMSZD2003Index_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kSMSZD2003Index_Detail = + new UCDPropertyDetail( UcdProperty.kSMSZD2003Index, VersionInfo.getInstance(15, 1, 0), 233, @@ -2078,8 +2082,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kSMSZD2003Readings_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kSMSZD2003Readings_Detail = + new UCDPropertyDetail( UcdProperty.kSMSZD2003Readings, VersionInfo.getInstance(15, 1, 0), 234, @@ -2087,8 +2091,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kVietnameseNumeric_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kVietnameseNumeric_Detail = + new UCDPropertyDetail( UcdProperty.kVietnameseNumeric, VersionInfo.getInstance(15, 1, 0), 235, @@ -2096,8 +2100,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kZhuangNumeric_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kZhuangNumeric_Detail = + new UCDPropertyDetail( 
UcdProperty.kZhuangNumeric, VersionInfo.getInstance(15, 1, 0), 236, @@ -2105,8 +2109,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail Indic_Conjunct_Break_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Indic_Conjunct_Break_Detail = + new UCDPropertyDetail( UcdProperty.Indic_Conjunct_Break, VersionInfo.getInstance(15, 1, 0), 237, @@ -2114,8 +2118,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail Modifier_Combining_Mark_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Modifier_Combining_Mark_Detail = + new UCDPropertyDetail( UcdProperty.Modifier_Combining_Mark, VersionInfo.getInstance(16, 0, 0), 238, @@ -2123,8 +2127,8 @@ public class UcdPropertyDetail { false, false, true); - public static UcdPropertyDetail kFanqie_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kFanqie_Detail = + new UCDPropertyDetail( UcdProperty.kFanqie, VersionInfo.getInstance(16, 0, 0), 239, @@ -2132,8 +2136,8 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail kZhuang_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail kZhuang_Detail = + new UCDPropertyDetail( UcdProperty.kZhuang, VersionInfo.getInstance(16, 0, 0), 240, @@ -2141,25 +2145,25 @@ public class UcdPropertyDetail { true, false, true); - public static UcdPropertyDetail Basic_Emoji_Detail = - new UcdPropertyDetail(UcdProperty.Basic_Emoji, -1, false, false, false, false); - public static UcdPropertyDetail CJK_Radical_Detail = - new UcdPropertyDetail(UcdProperty.CJK_Radical, -2, false, false, false, false); - public static UcdPropertyDetail Confusable_MA_Detail = - new UcdPropertyDetail(UcdProperty.Confusable_MA, -3, false, false, false, false); - public static UcdPropertyDetail Confusable_ML_Detail = - new UcdPropertyDetail(UcdProperty.Confusable_ML, -4, false, false, false, false); - public static UcdPropertyDetail Confusable_SA_Detail = 
- new UcdPropertyDetail(UcdProperty.Confusable_SA, -5, false, false, false, false); - public static UcdPropertyDetail Confusable_SL_Detail = - new UcdPropertyDetail(UcdProperty.Confusable_SL, -6, false, false, false, false); - public static UcdPropertyDetail Do_Not_Emit_Preferred_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Basic_Emoji_Detail = + new UCDPropertyDetail(UcdProperty.Basic_Emoji, -1, false, false, false, false); + public static UCDPropertyDetail CJK_Radical_Detail = + new UCDPropertyDetail(UcdProperty.CJK_Radical, -2, false, false, false, false); + public static UCDPropertyDetail Confusable_MA_Detail = + new UCDPropertyDetail(UcdProperty.Confusable_MA, -3, false, false, false, false); + public static UCDPropertyDetail Confusable_ML_Detail = + new UCDPropertyDetail(UcdProperty.Confusable_ML, -4, false, false, false, false); + public static UCDPropertyDetail Confusable_SA_Detail = + new UCDPropertyDetail(UcdProperty.Confusable_SA, -5, false, false, false, false); + public static UCDPropertyDetail Confusable_SL_Detail = + new UCDPropertyDetail(UcdProperty.Confusable_SL, -6, false, false, false, false); + public static UCDPropertyDetail Do_Not_Emit_Preferred_Detail = + new UCDPropertyDetail( UcdProperty.Do_Not_Emit_Preferred, -7, false, false, false, false); - public static UcdPropertyDetail Do_Not_Emit_Type_Detail = - new UcdPropertyDetail(UcdProperty.Do_Not_Emit_Type, -8, false, false, false, false); - public static UcdPropertyDetail Emoji_DCM_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Do_Not_Emit_Type_Detail = + new UCDPropertyDetail(UcdProperty.Do_Not_Emit_Type, -8, false, false, false, false); + public static UCDPropertyDetail Emoji_DCM_Detail = + new UCDPropertyDetail( UcdProperty.Emoji_DCM, VersionInfo.getInstance(6, 0, 0), -9, @@ -2167,8 +2171,8 @@ public class UcdPropertyDetail { false, false, false); - public static UcdPropertyDetail Emoji_KDDI_Detail = - new UcdPropertyDetail( + public static 
UCDPropertyDetail Emoji_KDDI_Detail = + new UCDPropertyDetail( UcdProperty.Emoji_KDDI, VersionInfo.getInstance(6, 0, 0), -10, @@ -2176,8 +2180,8 @@ public class UcdPropertyDetail { false, false, false); - public static UcdPropertyDetail Emoji_SB_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Emoji_SB_Detail = + new UCDPropertyDetail( UcdProperty.Emoji_SB, VersionInfo.getInstance(6, 0, 0), -11, @@ -2185,8 +2189,8 @@ public class UcdPropertyDetail { false, false, false); - public static UcdPropertyDetail Identifier_Status_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Identifier_Status_Detail = + new UCDPropertyDetail( UcdProperty.Identifier_Status, VersionInfo.getInstance(9, 0, 0), -12, @@ -2194,8 +2198,8 @@ public class UcdPropertyDetail { false, false, false); - public static UcdPropertyDetail Identifier_Type_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Identifier_Type_Detail = + new UCDPropertyDetail( UcdProperty.Identifier_Type, VersionInfo.getInstance(9, 0, 0), -13, @@ -2203,36 +2207,36 @@ public class UcdPropertyDetail { false, false, false); - public static UcdPropertyDetail Idn_2008_Detail = - new UcdPropertyDetail(UcdProperty.Idn_2008, -14, false, false, false, false); - public static UcdPropertyDetail Idn_Mapping_Detail = - new UcdPropertyDetail(UcdProperty.Idn_Mapping, -15, false, false, false, false); - public static UcdPropertyDetail Idn_Status_Detail = - new UcdPropertyDetail(UcdProperty.Idn_Status, -16, false, false, false, false); - public static UcdPropertyDetail Named_Sequences_Detail = - new UcdPropertyDetail(UcdProperty.Named_Sequences, -17, false, false, false, false); - public static UcdPropertyDetail Named_Sequences_Prov_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Idn_2008_Detail = + new UCDPropertyDetail(UcdProperty.Idn_2008, -14, false, false, false, false); + public static UCDPropertyDetail Idn_Mapping_Detail = + new 
UCDPropertyDetail(UcdProperty.Idn_Mapping, -15, false, false, false, false); + public static UCDPropertyDetail Idn_Status_Detail = + new UCDPropertyDetail(UcdProperty.Idn_Status, -16, false, false, false, false); + public static UCDPropertyDetail Named_Sequences_Detail = + new UCDPropertyDetail(UcdProperty.Named_Sequences, -17, false, false, false, false); + public static UCDPropertyDetail Named_Sequences_Prov_Detail = + new UCDPropertyDetail( UcdProperty.Named_Sequences_Prov, -18, false, false, false, false); - public static UcdPropertyDetail Other_Joining_Type_Detail = - new UcdPropertyDetail(UcdProperty.Other_Joining_Type, -19, false, false, false, false); - public static UcdPropertyDetail RGI_Emoji_Flag_Sequence_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Other_Joining_Type_Detail = + new UCDPropertyDetail(UcdProperty.Other_Joining_Type, -19, false, false, false, false); + public static UCDPropertyDetail RGI_Emoji_Flag_Sequence_Detail = + new UCDPropertyDetail( UcdProperty.RGI_Emoji_Flag_Sequence, -20, false, false, false, false); - public static UcdPropertyDetail RGI_Emoji_Keycap_Sequence_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail RGI_Emoji_Keycap_Sequence_Detail = + new UCDPropertyDetail( UcdProperty.RGI_Emoji_Keycap_Sequence, -21, false, false, false, false); - public static UcdPropertyDetail RGI_Emoji_Modifier_Sequence_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail RGI_Emoji_Modifier_Sequence_Detail = + new UCDPropertyDetail( UcdProperty.RGI_Emoji_Modifier_Sequence, -22, false, false, false, false); - public static UcdPropertyDetail RGI_Emoji_Tag_Sequence_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail RGI_Emoji_Tag_Sequence_Detail = + new UCDPropertyDetail( UcdProperty.RGI_Emoji_Tag_Sequence, -23, false, false, false, false); - public static UcdPropertyDetail RGI_Emoji_Zwj_Sequence_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail 
RGI_Emoji_Zwj_Sequence_Detail = + new UCDPropertyDetail( UcdProperty.RGI_Emoji_Zwj_Sequence, -24, false, false, false, false); - public static UcdPropertyDetail Standardized_Variant_Detail = - new UcdPropertyDetail( + public static UCDPropertyDetail Standardized_Variant_Detail = + new UCDPropertyDetail( UcdProperty.Standardized_Variant, -25, false, false, false, false); private UcdProperty ucdProperty; @@ -2244,7 +2248,7 @@ public class UcdPropertyDetail { private boolean isCJKShowIfEmpty; private boolean isOrgUCDXMLAttribute; - private UcdPropertyDetail( + private UCDPropertyDetail( UcdProperty ucdProperty, VersionInfo minVersion, int sortOrder, @@ -2263,7 +2267,7 @@ private UcdPropertyDetail( isOrgUCDXMLAttribute); } - private UcdPropertyDetail( + private UCDPropertyDetail( UcdProperty ucdProperty, int sortOrder, boolean isBaseAttribute, @@ -2281,7 +2285,7 @@ private UcdPropertyDetail( isOrgUCDXMLAttribute); } - private UcdPropertyDetail( + private UCDPropertyDetail( UcdProperty ucdProperty, VersionInfo minVersion, VersionInfo maxVersion, @@ -2310,19 +2314,19 @@ private UcdPropertyDetail( } } - public static Set values() { + public static Set values() { return allPropertyDetails; } - public static Set baseValues() { + public static Set baseValues() { return basePropertyDetails; } - public static Set cjkValues() { + public static Set cjkValues() { return cjkPropertyDetails; } - public static Set ucdxmlValues() { + public static Set ucdxmlValues() { return ucdxmlPropertyDetails; } diff --git a/unicodetools/src/main/java/org/unicode/xml/UcdSectionComponent.java b/unicodetools/src/main/java/org/unicode/xml/UCDSectionComponent.java similarity index 75% rename from unicodetools/src/main/java/org/unicode/xml/UcdSectionComponent.java rename to unicodetools/src/main/java/org/unicode/xml/UCDSectionComponent.java index 0773486ccf..550fcbbaf7 100644 --- a/unicodetools/src/main/java/org/unicode/xml/UcdSectionComponent.java +++ 
b/unicodetools/src/main/java/org/unicode/xml/UCDSectionComponent.java @@ -3,12 +3,15 @@ import com.ibm.icu.util.VersionInfo; import org.unicode.props.UcdProperty; -public class UcdSectionComponent { +/** + * Helper class that defines an object that stores the version range of a given UcdProperty. + */ +public class UCDSectionComponent { private final VersionInfo minVersion; private final VersionInfo maxVersion; private final UcdProperty ucdProperty; - UcdSectionComponent(VersionInfo minVersion, VersionInfo maxVersion, UcdProperty ucdProperty) { + UCDSectionComponent(VersionInfo minVersion, VersionInfo maxVersion, UcdProperty ucdProperty) { this.minVersion = minVersion; this.maxVersion = maxVersion; this.ucdProperty = ucdProperty; diff --git a/unicodetools/src/main/java/org/unicode/xml/UcdSectionDetail.java b/unicodetools/src/main/java/org/unicode/xml/UCDSectionDetail.java similarity index 72% rename from unicodetools/src/main/java/org/unicode/xml/UcdSectionDetail.java rename to unicodetools/src/main/java/org/unicode/xml/UCDSectionDetail.java index ceed693afd..ac84a5a414 100644 --- a/unicodetools/src/main/java/org/unicode/xml/UcdSectionDetail.java +++ b/unicodetools/src/main/java/org/unicode/xml/UCDSectionDetail.java @@ -3,7 +3,12 @@ import com.ibm.icu.util.VersionInfo; import org.unicode.props.UcdProperty; -public class UcdSectionDetail { +/** + * Helper class that defines an object that stores information about a section of the UCDXML file. + * Information includes the section name, the type of elements that the section contains, and the version range of + * the section. 
+ */ +public class UCDSectionDetail { public enum UcdSection { BLOCKS( @@ -74,7 +79,7 @@ public enum UcdSection { private final String childTag; private final VersionInfo minVersion; private final VersionInfo maxVersion; - private final UcdSectionDetail ucdSectionDetail; + private final UCDSectionDetail ucdSectionDetail; private final boolean parserWithRange; private final boolean parserWithMissing; @@ -83,7 +88,7 @@ public enum UcdSection { String childTag, VersionInfo minVersion, VersionInfo maxVersion, - UcdSectionDetail ucdSectionDetail, + UCDSectionDetail ucdSectionDetail, boolean parserWithRange, boolean parserWithMissing) { this.tag = tag; @@ -111,7 +116,7 @@ public VersionInfo getMaxVersion() { return maxVersion; } - public UcdSectionDetail getUcdSectionDetail() { + public UCDSectionDetail getUcdSectionDetail() { return ucdSectionDetail; } @@ -124,75 +129,75 @@ public boolean getParserWithMissing() { } } - public static UcdSectionDetail Blocks_Detail = - new UcdSectionDetail( + public static UCDSectionDetail Blocks_Detail = + new UCDSectionDetail( UcdSection.BLOCKS, - new UcdSectionComponent[] { - new UcdSectionComponent( + new UCDSectionComponent[] { + new UCDSectionComponent( VersionInfo.getInstance(1, 1, 0), null, UcdProperty.Block) }, 0); - public static UcdSectionDetail NamedSequences_Detail = - new UcdSectionDetail( + public static UCDSectionDetail NamedSequences_Detail = + new UCDSectionDetail( UcdSection.NAMEDSEQUENCES, - new UcdSectionComponent[] { - new UcdSectionComponent( + new UCDSectionComponent[] { + new UCDSectionComponent( VersionInfo.getInstance(1, 1, 0), null, UcdProperty.Named_Sequences) }, 1); - public static UcdSectionDetail ProvisionalNamedSequences_Detail = - new UcdSectionDetail( + public static UCDSectionDetail ProvisionalNamedSequences_Detail = + new UCDSectionDetail( UcdSection.PROVISIONALNAMEDSEQUENCES, - new UcdSectionComponent[] { - new UcdSectionComponent( + new UCDSectionComponent[] { + new UCDSectionComponent( 
VersionInfo.getInstance(5, 0, 0), VersionInfo.getInstance(13, 0, 0), UcdProperty.Named_Sequences_Prov) }, 1); - public static UcdSectionDetail NormalizationCorrections_Detail = - new UcdSectionDetail( + public static UCDSectionDetail NormalizationCorrections_Detail = + new UCDSectionDetail( UcdSection.NORMALIZATIONCORRECTIONS, - new UcdSectionComponent[] { - new UcdSectionComponent( + new UCDSectionComponent[] { + new UCDSectionComponent( VersionInfo.getInstance(1, 1, 0), null, UcdProperty.NC_Original) }, 2); - public static UcdSectionDetail StandardizedVariants_Detail = - new UcdSectionDetail( + public static UCDSectionDetail StandardizedVariants_Detail = + new UCDSectionDetail( UcdSection.STANDARDIZEDVARIANTS, - new UcdSectionComponent[] { - new UcdSectionComponent( + new UCDSectionComponent[] { + new UCDSectionComponent( VersionInfo.getInstance(1, 1, 0), null, UcdProperty.Standardized_Variant), - new UcdSectionComponent( + new UCDSectionComponent( VersionInfo.getInstance(13, 0, 0), null, UcdProperty.emoji_variation_sequence) }, 3); - public static UcdSectionDetail CJKRadicals_Detail = - new UcdSectionDetail( + public static UCDSectionDetail CJKRadicals_Detail = + new UCDSectionDetail( UcdSection.CJKRADICALS, - new UcdSectionComponent[] { - new UcdSectionComponent( + new UCDSectionComponent[] { + new UCDSectionComponent( VersionInfo.getInstance(1, 1, 0), null, UcdProperty.CJK_Radical) }, 4); - public static UcdSectionDetail EmojiSources_Detail = - new UcdSectionDetail( + public static UCDSectionDetail EmojiSources_Detail = + new UCDSectionDetail( UcdSection.EMOJISOURCES, - new UcdSectionComponent[] { - new UcdSectionComponent( + new UCDSectionComponent[] { + new UCDSectionComponent( VersionInfo.getInstance(1, 1, 0), null, UcdProperty.Emoji_DCM) }, 5); - public static UcdSectionDetail DoNotEmit_Detail = - new UcdSectionDetail( + public static UCDSectionDetail DoNotEmit_Detail = + new UCDSectionDetail( UcdSection.DONOTEMIT, - new UcdSectionComponent[] { - new 
UcdSectionComponent( + new UCDSectionComponent[] { + new UCDSectionComponent( VersionInfo.getInstance(1, 1, 0), null, UcdProperty.Do_Not_Emit_Type) @@ -200,11 +205,11 @@ public boolean getParserWithMissing() { 6); private final UcdSection ucdSection; - private final UcdSectionComponent[] ucdSectionComponents; + private final UCDSectionComponent[] ucdSectionComponents; private final int sortOrder; - private UcdSectionDetail( - UcdSection ucdSection, UcdSectionComponent[] ucdSectionComponents, int sortOrder) { + private UCDSectionDetail( + UcdSection ucdSection, UCDSectionComponent[] ucdSectionComponents, int sortOrder) { this.ucdSection = ucdSection; this.ucdSectionComponents = ucdSectionComponents; this.sortOrder = sortOrder; @@ -214,7 +219,7 @@ public UcdSection getSection() { return this.ucdSection; } - public UcdSectionComponent[] getUcdSectionComponents() { + public UCDSectionComponent[] getUcdSectionComponents() { return this.ucdSectionComponents; } diff --git a/unicodetools/src/main/java/org/unicode/xml/UcdXML.java b/unicodetools/src/main/java/org/unicode/xml/UCDXML.java similarity index 85% rename from unicodetools/src/main/java/org/unicode/xml/UcdXML.java rename to unicodetools/src/main/java/org/unicode/xml/UCDXML.java index c71ac10826..a07be9c21f 100644 --- a/unicodetools/src/main/java/org/unicode/xml/UcdXML.java +++ b/unicodetools/src/main/java/org/unicode/xml/UCDXML.java @@ -2,9 +2,22 @@ import com.ibm.icu.dev.tool.UOption; import com.ibm.icu.util.VersionInfo; -import java.io.*; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; import java.nio.charset.StandardCharsets; -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Locale; +import java.util.Map; +import java.util.Objects; 
import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.transform.TransformerConfigurationException; @@ -14,7 +27,12 @@ import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; -public class UcdXML { +/** + * Utility for generating UCDXML files. + * The utility can build flat or grouped versions of UCDXML for non-Unihan code points, Unihan code points, or the + * complete range of code points. + */ +public class UCDXML { private static final String NAMESPACE = "http://www.unicode.org/ns/2003/ucd/1.0"; @@ -72,7 +90,7 @@ public static void main(String[] args) throws Exception { if (options[HELP].doesOccur) { System.out.println( - "UcdXML --ucdversion {version number} --outputfolder {destination} " + "UCDXML --ucdversion {version number} --outputfolder {destination} " + "--range [ALL|NOUNIHAN|UNIHAN] --output [FLAT|GROUPED]"); System.exit(0); } @@ -173,11 +191,11 @@ private static void buildUcdXMLFile( UCDXMLOUTPUTRANGE outputRange, UCDXMLOUTPUTTYPE outputType) throws IOException, TransformerConfigurationException, SAXException { - int lowCodepoint = 0x0; - int highCodepoint = 0x10FFFF; + int lowCodePoint = 0x0; + int highCodePoint = 0x10FFFF; // Tangut - // int lowCodepoint = 0x17000; - // int highCodepoint = 0x1B2FB; + // int lowCodePoint = 0x17000; + // int highCodePoint = 0x1B2FB; // 0x10FFFF File tempFile = new File(destinationFolder, "temp.xml"); @@ -208,24 +226,24 @@ private static void buildUcdXMLFile( writer, attributeResolver, ucdVersion, - lowCodepoint, - highCodepoint, + lowCodePoint, + highCodePoint, outputRange, outputType); if (outputRange != UCDXMLOUTPUTRANGE.UNIHAN) { - ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.BLOCKS); - ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.NAMEDSEQUENCES); - ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.PROVISIONALNAMEDSEQUENCES); - ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.NORMALIZATIONCORRECTIONS); - 
ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.STANDARDIZEDVARIANTS); + ucdDataResolver.buildSection(UCDSectionDetail.UcdSection.BLOCKS); + ucdDataResolver.buildSection(UCDSectionDetail.UcdSection.NAMEDSEQUENCES); + ucdDataResolver.buildSection(UCDSectionDetail.UcdSection.PROVISIONALNAMEDSEQUENCES); + ucdDataResolver.buildSection(UCDSectionDetail.UcdSection.NORMALIZATIONCORRECTIONS); + ucdDataResolver.buildSection(UCDSectionDetail.UcdSection.STANDARDIZEDVARIANTS); if (ucdVersion.compareTo(VersionInfo.getInstance(5, 2, 0)) >= 0) { - ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.CJKRADICALS); + ucdDataResolver.buildSection(UCDSectionDetail.UcdSection.CJKRADICALS); } if (ucdVersion.compareTo(VersionInfo.getInstance(6, 0, 0)) >= 0) { - ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.EMOJISOURCES); + ucdDataResolver.buildSection(UCDSectionDetail.UcdSection.EMOJISOURCES); } if (ucdVersion.compareTo(VersionInfo.getInstance(16, 0, 0)) >= 0) { - ucdDataResolver.buildSection(UcdSectionDetail.UcdSection.DONOTEMIT); + ucdDataResolver.buildSection(UCDSectionDetail.UcdSection.DONOTEMIT); } } writer.endElement("ucd"); @@ -274,34 +292,34 @@ private static void buildRepertoire( UCDXMLWriter writer, AttributeResolver attributeResolver, VersionInfo ucdVersion, - int lowCodepoint, - int highCodepoint, + int lowCodePoint, + int highCodePoint, UCDXMLOUTPUTRANGE outputRange, UCDXMLOUTPUTTYPE outputType) throws SAXException { writer.startElement("repertoire"); { - for (int codepoint = lowCodepoint; codepoint <= highCodepoint; codepoint++) { - if (isWritableCodepoint(codepoint, outputRange, attributeResolver)) { + for (int CodePoint = lowCodePoint; CodePoint <= highCodePoint; CodePoint++) { + if (isWritableCodePoint(CodePoint, outputRange, attributeResolver)) { if (outputType == UCDXMLOUTPUTTYPE.GROUPED) { - codepoint = + CodePoint = buildGroup( writer, attributeResolver, ucdVersion, - codepoint, - highCodepoint, + CodePoint, + highCodePoint, outputRange, 
outputType); } else { - codepoint = + CodePoint = buildChars( writer, attributeResolver, ucdVersion, - codepoint, - highCodepoint, + CodePoint, + highCodePoint, outputRange, outputType, null); @@ -316,21 +334,21 @@ private static int buildGroup( UCDXMLWriter writer, AttributeResolver attributeResolver, VersionInfo ucdVersion, - int lowCodepoint, - int highCodepoint, + int lowCodePoint, + int highCodePoint, UCDXMLOUTPUTRANGE outputRange, UCDXMLOUTPUTTYPE outputType) throws SAXException { - int lastCodepointInGroup = - getLastCodepointInGroup(attributeResolver, lowCodepoint, highCodepoint); + int lastCodePointInGroup = + getLastCodePointInGroup(attributeResolver, lowCodePoint, highCodePoint); AttributesImpl groupAttrs = getGroupAttributes( ucdVersion, attributeResolver, - lowCodepoint, - lastCodepointInGroup, + lowCodePoint, + lastCodePointInGroup, outputRange); writer.startElement("group", groupAttrs); @@ -339,22 +357,22 @@ private static int buildGroup( writer, attributeResolver, ucdVersion, - lowCodepoint, - lastCodepointInGroup, + lowCodePoint, + lastCodePointInGroup, outputRange, outputType, groupAttrs); writer.endElement("group"); } - return lastCodepointInGroup; + return lastCodePointInGroup; } private static int buildChars( UCDXMLWriter writer, AttributeResolver attributeResolver, VersionInfo ucdVersion, - int lowCodepoint, - int highCodepoint, + int lowCodePoint, + int highCodePoint, UCDXMLOUTPUTRANGE outputRange, UCDXMLOUTPUTTYPE outputType, AttributesImpl groupAttrs) @@ -362,15 +380,15 @@ private static int buildChars( ArrayList range = new ArrayList<>(); Range rangeType = Range.NONRANGE; - for (int codepoint = lowCodepoint; codepoint <= highCodepoint; codepoint++) { - if (attributeResolver.isUnassignedCodepoint(codepoint) + for (int CodePoint = lowCodePoint; CodePoint <= highCodePoint; CodePoint++) { + if (attributeResolver.isUnassignedCodePoint(CodePoint) || (outputRange == UCDXMLOUTPUTRANGE.NOUNIHAN - && attributeResolver.isUnifiedIdeograph(codepoint))) 
{ - Range currentRangeType = getRangeType(attributeResolver, codepoint); + && attributeResolver.isUnifiedIdeograph(CodePoint))) { + Range currentRangeType = getRangeType(attributeResolver, CodePoint); if (!range.isEmpty()) { if (!currentRangeType.equals(rangeType) || attributeResolver.isDifferentRange( - ucdVersion, codepoint, codepoint - 1)) { + ucdVersion, CodePoint, CodePoint - 1)) { if (outputRange != UCDXMLOUTPUTRANGE.UNIHAN) { if (outputType == UCDXMLOUTPUTTYPE.GROUPED) { buildGroupedRange( @@ -388,7 +406,7 @@ private static int buildChars( range.clear(); } } - range.add(codepoint); + range.add(CodePoint); rangeType = currentRangeType; } else { if (!range.isEmpty()) { @@ -409,18 +427,18 @@ private static int buildChars( range.clear(); rangeType = Range.NONRANGE; } - if (isWritableCodepoint(codepoint, outputRange, attributeResolver)) { + if (isWritableCodePoint(CodePoint, outputRange, attributeResolver)) { if (outputType == UCDXMLOUTPUTTYPE.GROUPED) { buildGroupedChar( writer, attributeResolver, ucdVersion, - codepoint, + CodePoint, outputRange, groupAttrs); } else { buildUngroupedChar( - writer, attributeResolver, ucdVersion, codepoint, outputRange); + writer, attributeResolver, ucdVersion, CodePoint, outputRange); } } } @@ -436,38 +454,38 @@ private static int buildChars( } } } - return highCodepoint; + return highCodePoint; } private static void buildUngroupedChar( UCDXMLWriter writer, AttributeResolver attributeResolver, VersionInfo ucdVersion, - int codepoint, + int CodePoint, UCDXMLOUTPUTRANGE outputRange) throws SAXException { AttributesImpl charAttributes = - getAttributes(ucdVersion, attributeResolver, codepoint, outputRange); - buildChar(writer, attributeResolver, codepoint, charAttributes); + getAttributes(ucdVersion, attributeResolver, CodePoint, outputRange); + buildChar(writer, attributeResolver, CodePoint, charAttributes); } private static void buildGroupedChar( UCDXMLWriter writer, AttributeResolver attributeResolver, VersionInfo ucdVersion, - 
int codepoint, + int CodePoint, UCDXMLOUTPUTRANGE outputRange, AttributesImpl groupAttrs) throws SAXException { AttributesImpl orgCharAttributes = - getAttributes(ucdVersion, attributeResolver, codepoint, outputRange); + getAttributes(ucdVersion, attributeResolver, CodePoint, outputRange); AttributesImpl charAttributes = new AttributesImpl(); charAttributes.addAttribute( - NAMESPACE, "cp", "cp", "CDATA", attributeResolver.getHexString(codepoint)); + NAMESPACE, "cp", "cp", "CDATA", attributeResolver.getHexString(CodePoint)); - for (UcdPropertyDetail propDetail : UcdPropertyDetail.ucdxmlValues()) { + for (UCDPropertyDetail propDetail : UCDPropertyDetail.ucdxmlValues()) { String qName = propDetail.getUcdProperty().getShortName(); if (qName.startsWith("cjk")) { qName = qName.substring(2); @@ -483,18 +501,18 @@ private static void buildGroupedChar( Objects.requireNonNullElse(orgCharAttributesValue, "")); } } - buildChar(writer, attributeResolver, codepoint, charAttributes); + buildChar(writer, attributeResolver, CodePoint, charAttributes); } private static void buildChar( UCDXMLWriter writer, AttributeResolver attributeResolver, - int codepoint, + int CodePoint, AttributesImpl charAttributes) throws SAXException { writer.startElement("char", charAttributes); { - HashMap nameAliases = attributeResolver.getNameAliases(codepoint); + HashMap nameAliases = attributeResolver.getNameAliases(CodePoint); if (null != nameAliases && !nameAliases.isEmpty()) { for (String alias : nameAliases.keySet()) { AttributesImpl nameAliasAt = new AttributesImpl(); @@ -543,7 +561,7 @@ private static void buildGroupedRange( attributeResolver.getHexString(range.get(range.size() - 1))); } - for (UcdPropertyDetail propDetail : UcdPropertyDetail.ucdxmlValues()) { + for (UCDPropertyDetail propDetail : UCDPropertyDetail.ucdxmlValues()) { String qName = propDetail.getUcdProperty().getShortName(); if (qName.startsWith("cjk")) { qName = qName.substring(2); @@ -580,20 +598,20 @@ private static void 
buildUngroupedRange( } } - private static boolean isWritableCodepoint( - int codepoint, UCDXMLOUTPUTRANGE outputRange, AttributeResolver attributeResolver) { + private static boolean isWritableCodePoint( + int CodePoint, UCDXMLOUTPUTRANGE outputRange, AttributeResolver attributeResolver) { return outputRange == UCDXMLOUTPUTRANGE.ALL || (outputRange == UCDXMLOUTPUTRANGE.UNIHAN - && attributeResolver.isUnihanAttributeRange(codepoint)) + && attributeResolver.isUnihanAttributeRange(CodePoint)) || (outputRange == UCDXMLOUTPUTRANGE.NOUNIHAN - && !attributeResolver.isUnifiedIdeograph(codepoint)); + && !attributeResolver.isUnifiedIdeograph(CodePoint)); } - private static Range getRangeType(AttributeResolver attributeResolver, int codepoint) { - String NChar = attributeResolver.getNChar(codepoint); - UcdPropertyValues.General_Category_Values gc = attributeResolver.getgc(codepoint); + private static Range getRangeType(AttributeResolver attributeResolver, int CodePoint) { + String NChar = attributeResolver.getNChar(CodePoint); + UcdPropertyValues.General_Category_Values gc = attributeResolver.getgc(CodePoint); - if (attributeResolver.isUnihanAttributeRange(codepoint)) { + if (attributeResolver.isUnihanAttributeRange(CodePoint)) { return Range.CJKUNIFIEDIDEOGRAPH; } if (gc.equals(UcdPropertyValues.General_Category_Values.Surrogate)) { @@ -608,44 +626,44 @@ private static Range getRangeType(AttributeResolver attributeResolver, int codep return Range.RESERVED; } - private static int getLastCodepointInGroup( - AttributeResolver attributeResolver, int lowCodepoint, int highCodepoint) { - String blk = attributeResolver.getAttributeValue(UcdProperty.Block, lowCodepoint); - for (int codepoint = lowCodepoint; codepoint <= highCodepoint; codepoint++) { - if (!blk.equals(attributeResolver.getAttributeValue(UcdProperty.Block, codepoint))) { - return codepoint - 1; + private static int getLastCodePointInGroup( + AttributeResolver attributeResolver, int lowCodePoint, int highCodePoint) { + 
String blk = attributeResolver.getAttributeValue(UcdProperty.Block, lowCodePoint); + for (int CodePoint = lowCodePoint; CodePoint <= highCodePoint; CodePoint++) { + if (!blk.equals(attributeResolver.getAttributeValue(UcdProperty.Block, CodePoint))) { + return CodePoint - 1; } - if (codepoint == 0x20 - 1 // put the C0 controls in their own group - || codepoint == 0xa0 - 1 // put the C0 controls in their own group - || codepoint == 0x1160 - 1 // split the jamos into three groups - || codepoint == 0x11a8 - 1 // split the jamos into three groups - || codepoint == 0x1f1e6 - 1 // put the regional indicators in their own group + if (CodePoint == 0x20 - 1 // put the C0 controls in their own group + || CodePoint == 0xa0 - 1 // put the C1 controls in their own group + || CodePoint == 0x1160 - 1 // split the jamos into three groups + || CodePoint == 0x11a8 - 1 // split the jamos into three groups + || CodePoint == 0x1f1e6 - 1 // put the regional indicators in their own group ) { - return codepoint; + return CodePoint; } } - return highCodepoint; + return highCodePoint; } private static AttributesImpl getAttributes( VersionInfo version, AttributeResolver attributeResolver, - int codepoint, + int CodePoint, UCDXMLOUTPUTRANGE outputRange) { AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute( - NAMESPACE, "cp", "cp", "CDATA", attributeResolver.getHexString(codepoint)); + NAMESPACE, "cp", "cp", "CDATA", attributeResolver.getHexString(CodePoint)); - for (UcdPropertyDetail propDetail : UcdPropertyDetail.ucdxmlValues()) { + for (UCDPropertyDetail propDetail : UCDPropertyDetail.ucdxmlValues()) { UcdProperty prop = propDetail.getUcdProperty(); if (version.compareTo(propDetail.getMinVersion()) >= 0 && (propDetail.getMaxVersion() == null || version.compareTo(propDetail.getMaxVersion()) < 0)) { - String attrValue = attributeResolver.getAttributeValue(prop, codepoint); + String attrValue = attributeResolver.getAttributeValue(prop, CodePoint); boolean 
isAttributeIncluded = getIsAttributeIncluded( attrValue, - attributeResolver.isUnihanAttributeRange(codepoint), + attributeResolver.isUnihanAttributeRange(CodePoint), propDetail, prop, outputRange); @@ -664,12 +682,12 @@ private static AttributesImpl getAttributes( private static AttributesImpl getGroupAttributes( VersionInfo version, AttributeResolver attributeResolver, - int lowCodepoint, - int highCodepoint, + int lowCodePoint, + int highCodePoint, UCDXMLOUTPUTRANGE outputRange) { AttributesImpl attributes = new AttributesImpl(); - for (UcdPropertyDetail propDetail : UcdPropertyDetail.ucdxmlValues()) { + for (UCDPropertyDetail propDetail : UCDPropertyDetail.ucdxmlValues()) { UcdProperty prop = propDetail.getUcdProperty(); if (version.compareTo(propDetail.getMinVersion()) >= 0 && (propDetail.getMaxVersion() == null @@ -677,9 +695,9 @@ private static AttributesImpl getGroupAttributes( int totalCount = 0; Map counters = new LinkedHashMap<>(); - for (int codepoint = lowCodepoint; codepoint <= highCodepoint; codepoint++) { - if (!attributeResolver.isUnassignedCodepoint(codepoint)) { - String attrValue = attributeResolver.getAttributeValue(prop, codepoint); + for (int CodePoint = lowCodePoint; CodePoint <= highCodePoint; CodePoint++) { + if (!attributeResolver.isUnassignedCodePoint(CodePoint)) { + String attrValue = attributeResolver.getAttributeValue(prop, CodePoint); int currentCount = (counters.get(attrValue) == null) ? 
0 : counters.get(attrValue); currentCount++; @@ -714,7 +732,7 @@ private static AttributesImpl getGroupAttributes( boolean isAttributeIncluded = getIsAttributeIncluded( bestAttrValue, - attributeResolver.isUnihanAttributeRange(lowCodepoint), + attributeResolver.isUnihanAttributeRange(lowCodePoint), propDetail, prop, outputRange); @@ -735,7 +753,7 @@ private static AttributesImpl getGroupAttributes( private static boolean getIsAttributeIncluded( String attrValue, boolean isUnihanAttributeRange, - UcdPropertyDetail propDetail, + UCDPropertyDetail propDetail, UcdProperty prop, UCDXMLOUTPUTRANGE outputRange) { if (attrValue == null) { @@ -786,7 +804,7 @@ private static AttributesImpl getReservedAttributes( "CDATA", attributeResolver.getHexString(range.get(range.size() - 1))); } - for (UcdPropertyDetail propDetail : UcdPropertyDetail.baseValues()) { + for (UCDPropertyDetail propDetail : UCDPropertyDetail.baseValues()) { UcdProperty prop = propDetail.getUcdProperty(); if (version.compareTo(propDetail.getMinVersion()) >= 0 && (propDetail.getMaxVersion() == null diff --git a/unicodetools/src/main/java/org/unicode/xml/UCDXMLWriter.java b/unicodetools/src/main/java/org/unicode/xml/UCDXMLWriter.java index ff31e69c61..178d194e34 100644 --- a/unicodetools/src/main/java/org/unicode/xml/UCDXMLWriter.java +++ b/unicodetools/src/main/java/org/unicode/xml/UCDXMLWriter.java @@ -1,6 +1,8 @@ package org.unicode.xml; import java.io.FileOutputStream; +import java.text.SimpleDateFormat; +import java.util.Date; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerConfigurationException; @@ -11,6 +13,9 @@ import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; +/** + * Helper class for writing the contents for the UCDXML files. 
+ */ public class UCDXMLWriter { public static final String NAMESPACE = "http://www.unicode.org/ns/2003/ucd/1.0"; @@ -36,11 +41,11 @@ public UCDXMLWriter(FileOutputStream f) throws TransformerConfigurationException } public void startFile() throws SAXException { + String copyrightYear = new SimpleDateFormat("yyyy").format(new Date()); transformerHandler.startDocument(); char[] c = "\n".toCharArray(); transformerHandler.characters(c, 0, c.length); - // TODO: JRW change hardcoded 2023 to current year. - c = " \u00A9 2023 Unicode\u00AE, Inc. ".toCharArray(); + c = (" \u00A9 " + copyrightYear + " Unicode\u00AE, Inc. ").toCharArray(); transformerHandler.comment(c, 0, c.length); c = "\n".toCharArray(); transformerHandler.characters(c, 0, c.length); diff --git a/unicodetools/src/main/java/org/unicode/xml/XMLProperties.java b/unicodetools/src/main/java/org/unicode/xml/XMLProperties.java index 5cd2df3af4..b58b50f070 100644 --- a/unicodetools/src/main/java/org/unicode/xml/XMLProperties.java +++ b/unicodetools/src/main/java/org/unicode/xml/XMLProperties.java @@ -4,14 +4,24 @@ import java.io.File; import java.io.FileInputStream; import java.io.IOException; -import java.util.*; -import java.util.Map.Entry; +import java.util.ArrayList; +import java.util.EnumMap; +import java.util.HashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + import org.unicode.cldr.util.XMLFileReader; import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.UcdProperty; import org.unicode.text.utility.Utility; import org.xml.sax.*; +/** + * Helper class for org.unicode.xml.CompareUCDXML. + * Facilitates traversal of the contents of a UCDXML file. 
+ */ public class XMLProperties { enum XmlLeaf { @@ -187,7 +197,7 @@ public void startElement( case SURROGATE: case NONCHARACTER: parseCp(attributes); - for (final Entry entry : attributes.entrySet()) { + for (final Map.Entry entry : attributes.entrySet()) { doAttributes(entry.getKey(), entry.getValue()); } if (xmlLeaf == XmlLeaf.NONCHARACTER) { diff --git a/unicodetools/src/main/resources/org/unicode/uax42/index.xml b/unicodetools/src/main/resources/org/unicode/uax42/index.xml index 6b4733a2b0..c0f05f5c2c 100644 --- a/unicodetools/src/main/resources/org/unicode/uax42/index.xml +++ b/unicodetools/src/main/resources/org/unicode/uax42/index.xml @@ -12,7 +12,7 @@ stage='proposed-update' schema='rnc' prevrev='34'/> - 2024 + 2025 @@ -21,7 +21,7 @@ - + New value for the age attribute: 16.0. From 5d084fbcb336aed517a937aff4eaf1dca77b7e56 Mon Sep 17 00:00:00 2001 From: John Wilcock Date: Fri, 7 Feb 2025 16:12:28 -0800 Subject: [PATCH 05/10] Ran spotless --- .../org/unicode/tools/emoji/LoadImage.java | 3 +- .../org/unicode/xml/AttributeResolver.java | 20 ++++++------- .../java/org/unicode/xml/CompareUCDXML.java | 7 ++--- .../unicode/xml/GeneratePropertyValues.java | 12 ++++---- .../java/org/unicode/xml/UCDDataResolver.java | 13 ++++----- .../org/unicode/xml/UCDPropertyDetail.java | 28 +++++++++---------- .../org/unicode/xml/UCDSectionComponent.java | 4 +-- .../org/unicode/xml/UCDSectionDetail.java | 4 +-- .../src/main/java/org/unicode/xml/UCDXML.java | 6 ++-- .../java/org/unicode/xml/UCDXMLWriter.java | 4 +-- .../java/org/unicode/xml/XMLProperties.java | 5 ++-- .../unittest/TestLocaleConstruction.java | 6 ++-- 12 files changed, 51 insertions(+), 61 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/LoadImage.java b/unicodetools/src/main/java/org/unicode/tools/emoji/LoadImage.java index 7f9f3008b0..c7912b6906 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/LoadImage.java +++ 
b/unicodetools/src/main/java/org/unicode/tools/emoji/LoadImage.java @@ -891,7 +891,8 @@ public static void doSb(String outputDir) throws IOException { // try { // copy(new URL(url), new File(outputDir + "/sb","sb_" + code + ".gif")); //// BufferedImage sourceImage = ImageIO.read(new URL(url)); - //// writeImage(sourceImage,outputDir + "/sb","sb_" + code, "gif"); + //// writeImage(sourceImage,outputDir + "/sb","sb_" + code, + // "gif"); // System.out.println(code); // } catch (Exception e) { // System.out.println("Skipping " + code); diff --git a/unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java b/unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java index 38d05786d5..2d268878ec 100644 --- a/unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java +++ b/unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java @@ -2,6 +2,12 @@ import com.ibm.icu.impl.UnicodeMap; import com.ibm.icu.util.VersionInfo; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Locale; +import java.util.Optional; import org.unicode.cldr.draft.FileUtilities; import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.PropertyParsingInfo; @@ -10,17 +16,10 @@ import org.unicode.props.UcdPropertyValues; import org.unicode.props.UnicodeProperty; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.LinkedList; -import java.util.List; -import java.util.Locale; -import java.util.Optional; - /** - * Used by UCDXML to get string values of attributes for each code point from IndexUnicodeProperties. + * Used by UCDXML to get string values of attributes for each code point from + * IndexUnicodeProperties. 
*/ - public class AttributeResolver { private final IndexUnicodeProperties indexUnicodeProperties; @@ -249,7 +248,8 @@ public String getAttributeValue(UcdProperty prop, int codepoint) { .toLowerCase(Locale.ROOT); default: final UnicodeProperty property = indexUnicodeProperties.getProperty(prop); - final List valueAliases = property.getValueAliases(property.getValue(codepoint)); + final List valueAliases = + property.getValueAliases(property.getValue(codepoint)); return valueAliases.get(0); } case Binary: diff --git a/unicodetools/src/main/java/org/unicode/xml/CompareUCDXML.java b/unicodetools/src/main/java/org/unicode/xml/CompareUCDXML.java index d2876c1229..f09f98e86b 100644 --- a/unicodetools/src/main/java/org/unicode/xml/CompareUCDXML.java +++ b/unicodetools/src/main/java/org/unicode/xml/CompareUCDXML.java @@ -3,7 +3,6 @@ import com.ibm.icu.dev.tool.UOption; import com.ibm.icu.impl.UnicodeMap; import com.ibm.icu.text.UnicodeSet; - import java.io.File; import java.io.IOException; import java.util.HashMap; @@ -11,9 +10,9 @@ import org.unicode.props.UcdProperty; /** - * Utility for comparing two UCDXML files. - * Originally intended to compare UCDXML files generated using https://github.com/eric-muller/ucdxml to UCDXML files - * generated using org.unicode.xml.UCDXML. + * Utility for comparing two UCDXML files. Originally intended to compare UCDXML files generated + * using https://github.com/eric-muller/ucdxml to UCDXML files generated using + * org.unicode.xml.UCDXML. 
*/ public class CompareUCDXML { diff --git a/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java b/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java index 03f10a428b..8b51c7350b 100644 --- a/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java +++ b/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java @@ -2,7 +2,6 @@ import com.ibm.icu.dev.tool.UOption; import com.ibm.icu.util.VersionInfo; - import java.io.BufferedWriter; import java.io.File; import java.io.FileOutputStream; @@ -24,8 +23,9 @@ import org.unicode.props.UcdPropertyValues.*; /** - * Utility for generating fragments that describe the property values in a format that can be displayed in UAX42. - * UAX42 fragments live in unicodetools/src/main/resources/org/unicode/uax42/fragments + * Utility for generating fragments that describe the property values in a format that can be + * displayed in UAX42. UAX42 fragments live in + * unicodetools/src/main/resources/org/unicode/uax42/fragments */ public class GeneratePropertyValues { @@ -417,10 +417,8 @@ private static void createPropertyFragment( writer.close(); } - private static BufferedWriter getFragmentWriter(String filename) - throws IOException { - File fragmentFolder = - new File(destinationFolder + File.separator); + private static BufferedWriter getFragmentWriter(String filename) throws IOException { + File fragmentFolder = new File(destinationFolder + File.separator); if (!fragmentFolder.exists()) { if (!fragmentFolder.mkdir()) { throw new IOException(); diff --git a/unicodetools/src/main/java/org/unicode/xml/UCDDataResolver.java b/unicodetools/src/main/java/org/unicode/xml/UCDDataResolver.java index d30693e838..d607a661f6 100644 --- a/unicodetools/src/main/java/org/unicode/xml/UCDDataResolver.java +++ b/unicodetools/src/main/java/org/unicode/xml/UCDDataResolver.java @@ -1,6 +1,10 @@ package org.unicode.xml; import com.ibm.icu.util.VersionInfo; +import java.util.ArrayList; 
+import java.util.Collections; +import java.util.HashMap; +import java.util.List; import org.unicode.cldr.draft.FileUtilities; import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.PropertyParsingInfo; @@ -8,14 +12,7 @@ import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; - -/** - * Helper class for building sections of UCDXML files based on IndexUnicodeProperties values. - */ +/** Helper class for building sections of UCDXML files based on IndexUnicodeProperties values. */ public class UCDDataResolver { private final IndexUnicodeProperties indexUnicodeProperties; diff --git a/unicodetools/src/main/java/org/unicode/xml/UCDPropertyDetail.java b/unicodetools/src/main/java/org/unicode/xml/UCDPropertyDetail.java index 39192fd36b..9dab8117b2 100644 --- a/unicodetools/src/main/java/org/unicode/xml/UCDPropertyDetail.java +++ b/unicodetools/src/main/java/org/unicode/xml/UCDPropertyDetail.java @@ -6,8 +6,8 @@ import org.unicode.props.UcdProperty; /** - * Helper class for determining how and when UCD properties should be shown in UCDXML. Also includes information - * about when a UCDProperty was added to Unicode. + * Helper class for determining how and when UCD properties should be shown in UCDXML. Also includes + * information about when a UCDProperty was added to Unicode. 
*/ public class UCDPropertyDetail { @@ -2100,11 +2100,20 @@ public class UCDPropertyDetail { true, false, true); + public static UCDPropertyDetail kZhuang_Detail = + new UCDPropertyDetail( + UcdProperty.kZhuang, + VersionInfo.getInstance(16, 0, 0), + 236, + false, + true, + false, + true); public static UCDPropertyDetail kZhuangNumeric_Detail = new UCDPropertyDetail( UcdProperty.kZhuangNumeric, VersionInfo.getInstance(15, 1, 0), - 236, + 237, false, true, false, @@ -2113,7 +2122,7 @@ public class UCDPropertyDetail { new UCDPropertyDetail( UcdProperty.Indic_Conjunct_Break, VersionInfo.getInstance(15, 1, 0), - 237, + 238, true, false, false, @@ -2122,7 +2131,7 @@ public class UCDPropertyDetail { new UCDPropertyDetail( UcdProperty.Modifier_Combining_Mark, VersionInfo.getInstance(16, 0, 0), - 238, + 239, true, false, false, @@ -2131,15 +2140,6 @@ public class UCDPropertyDetail { new UCDPropertyDetail( UcdProperty.kFanqie, VersionInfo.getInstance(16, 0, 0), - 239, - false, - true, - false, - true); - public static UCDPropertyDetail kZhuang_Detail = - new UCDPropertyDetail( - UcdProperty.kZhuang, - VersionInfo.getInstance(16, 0, 0), 240, false, true, diff --git a/unicodetools/src/main/java/org/unicode/xml/UCDSectionComponent.java b/unicodetools/src/main/java/org/unicode/xml/UCDSectionComponent.java index 550fcbbaf7..0cef1e345b 100644 --- a/unicodetools/src/main/java/org/unicode/xml/UCDSectionComponent.java +++ b/unicodetools/src/main/java/org/unicode/xml/UCDSectionComponent.java @@ -3,9 +3,7 @@ import com.ibm.icu.util.VersionInfo; import org.unicode.props.UcdProperty; -/** - * Helper class that defines an object that stores the version range of a given UcdProperty. - */ +/** Helper class that defines an object that stores the version range of a given UcdProperty. 
*/ public class UCDSectionComponent { private final VersionInfo minVersion; private final VersionInfo maxVersion; diff --git a/unicodetools/src/main/java/org/unicode/xml/UCDSectionDetail.java b/unicodetools/src/main/java/org/unicode/xml/UCDSectionDetail.java index ac84a5a414..6db3cf82bb 100644 --- a/unicodetools/src/main/java/org/unicode/xml/UCDSectionDetail.java +++ b/unicodetools/src/main/java/org/unicode/xml/UCDSectionDetail.java @@ -5,8 +5,8 @@ /** * Helper class that defines an object that stores information about a section of the UCDXML file. - * Information includes the section name, the type of elements that the section contains, and the version range of - * the section. + * Information includes the section name, the type of elements that the section contains, and the + * version range of the section. */ public class UCDSectionDetail { diff --git a/unicodetools/src/main/java/org/unicode/xml/UCDXML.java b/unicodetools/src/main/java/org/unicode/xml/UCDXML.java index a07be9c21f..6bcfa74510 100644 --- a/unicodetools/src/main/java/org/unicode/xml/UCDXML.java +++ b/unicodetools/src/main/java/org/unicode/xml/UCDXML.java @@ -2,7 +2,6 @@ import com.ibm.icu.dev.tool.UOption; import com.ibm.icu.util.VersionInfo; - import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; @@ -28,9 +27,8 @@ import org.xml.sax.helpers.AttributesImpl; /** - * Utility for generating UCDXML files. - * The utility can build flat or grouped versions of UCDXML for non-Unihan code points, Unihan code points, or the - * complete range of code points. + * Utility for generating UCDXML files. The utility can build flat or grouped versions of UCDXML for + * non-Unihan code points, Unihan code points, or the complete range of code points. 
*/ public class UCDXML { diff --git a/unicodetools/src/main/java/org/unicode/xml/UCDXMLWriter.java b/unicodetools/src/main/java/org/unicode/xml/UCDXMLWriter.java index 178d194e34..7358ed26f0 100644 --- a/unicodetools/src/main/java/org/unicode/xml/UCDXMLWriter.java +++ b/unicodetools/src/main/java/org/unicode/xml/UCDXMLWriter.java @@ -13,9 +13,7 @@ import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; -/** - * Helper class for writing the contents for the UCDXML files. - */ +/** Helper class for writing the contents for the UCDXML files. */ public class UCDXMLWriter { public static final String NAMESPACE = "http://www.unicode.org/ns/2003/ucd/1.0"; diff --git a/unicodetools/src/main/java/org/unicode/xml/XMLProperties.java b/unicodetools/src/main/java/org/unicode/xml/XMLProperties.java index b58b50f070..d1f6e178e8 100644 --- a/unicodetools/src/main/java/org/unicode/xml/XMLProperties.java +++ b/unicodetools/src/main/java/org/unicode/xml/XMLProperties.java @@ -11,7 +11,6 @@ import java.util.List; import java.util.Map; import java.util.Set; - import org.unicode.cldr.util.XMLFileReader; import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.UcdProperty; @@ -19,8 +18,8 @@ import org.xml.sax.*; /** - * Helper class for org.unicode.xml.CompareUCDXML. - * Facilitates traversal of the contents of a UCDXML file. + * Helper class for org.unicode.xml.CompareUCDXML. Facilitates traversal of the contents of a UCDXML + * file. 
*/ public class XMLProperties { diff --git a/unicodetools/src/test/java/org/unicode/unittest/TestLocaleConstruction.java b/unicodetools/src/test/java/org/unicode/unittest/TestLocaleConstruction.java index eb31c04520..9b6dd49835 100644 --- a/unicodetools/src/test/java/org/unicode/unittest/TestLocaleConstruction.java +++ b/unicodetools/src/test/java/org/unicode/unittest/TestLocaleConstruction.java @@ -377,7 +377,8 @@ void buildLocale(Multimap args) { //// AliasesFull aliases = new AliasesFull(dataType); //// Output> exception = new Output<>(); //// - //// for (Entry entry : validityInfo.get(dataType).entrySet()) + //// for (Entry entry : + // validityInfo.get(dataType).entrySet()) // { //// for (String code : entry.getValue().regularData) { //// String replacement = aliases.getCanonical( @@ -388,7 +389,8 @@ void buildLocale(Multimap args) { //// if (replacement != null) { //// if (DEBUG) System.out.println(code + " ==> " + replacement); //// } else if (exception.value != null){ - //// if (DEBUG) System.out.println(code + " ==> " + exception.toString()); + //// if (DEBUG) System.out.println(code + " ==> " + + // exception.toString()); //// } //// } //// } From 9327e29b414f60ef0ba33defa2119e0140a7cd61 Mon Sep 17 00:00:00 2001 From: John Wilcock Date: Sat, 8 Feb 2025 12:35:11 -0800 Subject: [PATCH 06/10] Ran GenerateEnums --- .../java/org/unicode/props/UcdProperty.java | 119 ++-- .../org/unicode/props/UcdPropertyValues.java | 608 ++++++++---------- 2 files changed, 310 insertions(+), 417 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/props/UcdProperty.java b/unicodetools/src/main/java/org/unicode/props/UcdProperty.java index 60efa304db..73760b1d8d 100644 --- a/unicodetools/src/main/java/org/unicode/props/UcdProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UcdProperty.java @@ -44,13 +44,13 @@ */ public enum UcdProperty { - // Numeric + // Numeric Numeric_Value(PropertyType.Numeric, "nv"), kAccountingNumeric(PropertyType.Numeric, 
"cjkAccountingNumeric"), kOtherNumeric(PropertyType.Numeric, "cjkOtherNumeric"), kPrimaryNumeric(PropertyType.Numeric, null, ValueCardinality.Ordered, "cjkPrimaryNumeric"), - // String + // String Bidi_Mirroring_Glyph(PropertyType.String, "bmg"), Bidi_Paired_Bracket(PropertyType.String, "bpb"), Case_Folding(PropertyType.String, "cf"), @@ -73,12 +73,10 @@ public enum UcdProperty { Titlecase_Mapping(PropertyType.String, "tc"), Uppercase_Mapping(PropertyType.String, "uc"), kCompatibilityVariant(PropertyType.String, "cjkCompatibilityVariant"), - kSimplifiedVariant( - PropertyType.String, null, ValueCardinality.Unordered, "cjkSimplifiedVariant"), - kTraditionalVariant( - PropertyType.String, null, ValueCardinality.Unordered, "cjkTraditionalVariant"), + kSimplifiedVariant(PropertyType.String, null, ValueCardinality.Unordered, "cjkSimplifiedVariant"), + kTraditionalVariant(PropertyType.String, null, ValueCardinality.Unordered, "cjkTraditionalVariant"), - // Miscellaneous + // Miscellaneous CJK_Radical(PropertyType.Miscellaneous, null, ValueCardinality.Ordered, "CJKR"), Emoji_DCM(PropertyType.Miscellaneous, "EDCM"), Emoji_KDDI(PropertyType.Miscellaneous, "EKDDI"), @@ -94,16 +92,12 @@ public enum UcdProperty { Named_Sequences_Prov(PropertyType.Miscellaneous, "NSP"), Standardized_Variant(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "SV"), Unicode_1_Name(PropertyType.Miscellaneous, "na1"), + emoji_variation_sequence(PropertyType.Miscellaneous, "EVS"), kAlternateHanYu(PropertyType.Miscellaneous, "cjkAlternateHanYu"), kAlternateJEF(PropertyType.Miscellaneous, "cjkAlternateJEF"), kAlternateKangXi(PropertyType.Miscellaneous, "cjkAlternateKangXi"), kAlternateMorohashi(PropertyType.Miscellaneous, "cjkAlternateMorohashi"), - kAlternateTotalStrokes( - PropertyType.Miscellaneous, - null, - ValueCardinality.Unordered, - "cjkAlternateTotalStrokes"), - emoji_variation_sequence(PropertyType.Miscellaneous, "EVS"), + kAlternateTotalStrokes(PropertyType.Miscellaneous, null, 
ValueCardinality.Unordered, "cjkAlternateTotalStrokes"), kBigFive(PropertyType.Miscellaneous, "cjkBigFive"), kCCCII(PropertyType.Miscellaneous, "cjkCCCII"), kCNS1986(PropertyType.Miscellaneous, "cjkCNS1986"), @@ -111,8 +105,7 @@ public enum UcdProperty { kCangjie(PropertyType.Miscellaneous, "cjkCangjie"), kCantonese(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkCantonese"), kCheungBauer(PropertyType.Miscellaneous, "cjkCheungBauer"), - kCheungBauerIndex( - PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkCheungBauerIndex"), + kCheungBauerIndex(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkCheungBauerIndex"), kCihaiT(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkCihaiT"), kCowles(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkCowles"), kDaeJaweon(PropertyType.Miscellaneous, "cjkDaeJaweon"), @@ -129,8 +122,7 @@ public enum UcdProperty { kFanqie(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkFanqie"), kFenn(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkFenn"), kFennIndex(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkFennIndex"), - kFourCornerCode( - PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkFourCornerCode"), + kFourCornerCode(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkFourCornerCode"), kFrequency(PropertyType.Miscellaneous, "cjkFrequency"), kGB0(PropertyType.Miscellaneous, "cjkGB0"), kGB1(PropertyType.Miscellaneous, "cjkGB1"), @@ -170,8 +162,7 @@ public enum UcdProperty { kJapanese(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkJapanese"), kJapaneseKun(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkJapaneseKun"), kJapaneseOn(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkJapaneseOn"), - kJinmeiyoKanji( - PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkJinmeiyoKanji"), + 
kJinmeiyoKanji(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkJinmeiyoKanji"), kJis0(PropertyType.Miscellaneous, "cjkJis0"), kJis1(PropertyType.Miscellaneous, "cjkJis1"), kJoyoKanji(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkJoyoKanji"), @@ -182,11 +173,7 @@ public enum UcdProperty { kKangXi(PropertyType.Miscellaneous, "cjkKangXi"), kKarlgren(PropertyType.Miscellaneous, "cjkKarlgren"), kKorean(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkKorean"), - kKoreanEducationHanja( - PropertyType.Miscellaneous, - null, - ValueCardinality.Unordered, - "cjkKoreanEducationHanja"), + kKoreanEducationHanja(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkKoreanEducationHanja"), kKoreanName(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkKoreanName"), kLau(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkLau"), kMainlandTelegraph(PropertyType.Miscellaneous, "cjkMainlandTelegraph"), @@ -198,36 +185,21 @@ public enum UcdProperty { kNelson(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkNelson"), kPhonetic(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkPhonetic"), kPseudoGB1(PropertyType.Miscellaneous, "cjkPseudoGB1"), - kRSAdobe_Japan1_6( - PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkRSAdobe_Japan1_6"), + kRSAdobe_Japan1_6(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkRSAdobe_Japan1_6"), kRSJapanese(PropertyType.Miscellaneous, "cjkRSJapanese"), kRSKanWa(PropertyType.Miscellaneous, "cjkRSKanWa"), kRSKangXi(PropertyType.Miscellaneous, "cjkRSKangXi"), kRSKorean(PropertyType.Miscellaneous, "cjkRSKorean"), kRSMerged(PropertyType.Miscellaneous, "cjkRSMerged"), kRSTUnicode(PropertyType.Miscellaneous, "kRSTUnicode"), - kRSUnicode( - PropertyType.Miscellaneous, - null, - ValueCardinality.Ordered, - "cjkRSUnicode", - "Unicode_Radical_Stroke", - "URS"), + kRSUnicode(PropertyType.Miscellaneous, null, 
ValueCardinality.Ordered, "cjkRSUnicode", "Unicode_Radical_Stroke", "URS"), kReading(PropertyType.Miscellaneous, "kReading"), kSBGY(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSBGY"), - kSMSZD2003Index( - PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSMSZD2003Index"), - kSMSZD2003Readings( - PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSMSZD2003Readings"), - kSemanticVariant( - PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSemanticVariant"), - kSpecializedSemanticVariant( - PropertyType.Miscellaneous, - null, - ValueCardinality.Unordered, - "cjkSpecializedSemanticVariant"), - kSpoofingVariant( - PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSpoofingVariant"), + kSMSZD2003Index(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSMSZD2003Index"), + kSMSZD2003Readings(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSMSZD2003Readings"), + kSemanticVariant(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSemanticVariant"), + kSpecializedSemanticVariant(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSpecializedSemanticVariant"), + kSpoofingVariant(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSpoofingVariant"), kSrc_NushuDuben(PropertyType.Miscellaneous, "kSrc_NushuDuben"), kStrange(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkStrange"), kTGH(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkTGH"), @@ -238,48 +210,36 @@ public enum UcdProperty { kTotalStrokes(PropertyType.Miscellaneous, null, ValueCardinality.Ordered, "cjkTotalStrokes"), kUnihanCore2020(PropertyType.Miscellaneous, "cjkUnihanCore2020"), kVietnamese(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkVietnamese"), - kVietnameseNumeric( - PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkVietnameseNumeric"), + 
kVietnameseNumeric(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkVietnameseNumeric"), kXHC1983(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkXHC1983"), kXerox(PropertyType.Miscellaneous, "cjkXerox"), kZVariant(PropertyType.Miscellaneous, "cjkZVariant"), kZhuang(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkZhuang"), - kZhuangNumeric( - PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkZhuangNumeric"), + kZhuangNumeric(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkZhuangNumeric"), - // Catalog + // Catalog Age(PropertyType.Catalog, Age_Values.class, null, "age"), Block(PropertyType.Catalog, Block_Values.class, null, "blk"), Script(PropertyType.Catalog, Script_Values.class, null, "sc"), Script_Extensions(PropertyType.Catalog, Script_Values.class, ValueCardinality.Unordered, "scx"), - // Enumerated + // Enumerated Bidi_Class(PropertyType.Enumerated, Bidi_Class_Values.class, null, "bc"), - Bidi_Paired_Bracket_Type( - PropertyType.Enumerated, Bidi_Paired_Bracket_Type_Values.class, null, "bpt"), - Canonical_Combining_Class( - PropertyType.Enumerated, Canonical_Combining_Class_Values.class, null, "ccc"), + Bidi_Paired_Bracket_Type(PropertyType.Enumerated, Bidi_Paired_Bracket_Type_Values.class, null, "bpt"), + Canonical_Combining_Class(PropertyType.Enumerated, Canonical_Combining_Class_Values.class, null, "ccc"), Decomposition_Type(PropertyType.Enumerated, Decomposition_Type_Values.class, null, "dt"), - Do_Not_Emit_Type( - PropertyType.Enumerated, Do_Not_Emit_Type_Values.class, null, "Do_Not_Emit_Type"), + Do_Not_Emit_Type(PropertyType.Enumerated, Do_Not_Emit_Type_Values.class, null, "Do_Not_Emit_Type"), East_Asian_Width(PropertyType.Enumerated, East_Asian_Width_Values.class, null, "ea"), General_Category(PropertyType.Enumerated, General_Category_Values.class, null, "gc"), - Grapheme_Cluster_Break( - PropertyType.Enumerated, Grapheme_Cluster_Break_Values.class, null, "GCB"), + 
Grapheme_Cluster_Break(PropertyType.Enumerated, Grapheme_Cluster_Break_Values.class, null, "GCB"), Hangul_Syllable_Type(PropertyType.Enumerated, Hangul_Syllable_Type_Values.class, null, "hst"), Identifier_Status(PropertyType.Enumerated, Identifier_Status_Values.class, null, "ID_Status"), - Identifier_Type( - PropertyType.Enumerated, - Identifier_Type_Values.class, - ValueCardinality.Unordered, - "ID_Type"), + Identifier_Type(PropertyType.Enumerated, Identifier_Type_Values.class, ValueCardinality.Unordered, "ID_Type"), Idn_2008(PropertyType.Enumerated, Idn_2008_Values.class, null, "idn8"), Idn_Status(PropertyType.Enumerated, Idn_Status_Values.class, null, "idns"), Indic_Conjunct_Break(PropertyType.Enumerated, Indic_Conjunct_Break_Values.class, null, "InCB"), - Indic_Positional_Category( - PropertyType.Enumerated, Indic_Positional_Category_Values.class, null, "InPC"), - Indic_Syllabic_Category( - PropertyType.Enumerated, Indic_Syllabic_Category_Values.class, null, "InSC"), + Indic_Positional_Category(PropertyType.Enumerated, Indic_Positional_Category_Values.class, null, "InPC"), + Indic_Syllabic_Category(PropertyType.Enumerated, Indic_Syllabic_Category_Values.class, null, "InSC"), Joining_Group(PropertyType.Enumerated, Joining_Group_Values.class, null, "jg"), Joining_Type(PropertyType.Enumerated, Joining_Type_Values.class, null, "jt"), Line_Break(PropertyType.Enumerated, Line_Break_Values.class, null, "lb"), @@ -288,14 +248,13 @@ public enum UcdProperty { NFKC_Quick_Check(PropertyType.Enumerated, NFKC_Quick_Check_Values.class, null, "NFKC_QC"), NFKD_Quick_Check(PropertyType.Enumerated, NFKD_Quick_Check_Values.class, null, "NFKD_QC"), Numeric_Type(PropertyType.Enumerated, Numeric_Type_Values.class, null, "nt"), - Other_Joining_Type( - PropertyType.Enumerated, Other_Joining_Type_Values.class, null, "Other_Joining_Type"), + Other_Joining_Type(PropertyType.Enumerated, Other_Joining_Type_Values.class, null, "Other_Joining_Type"), Sentence_Break(PropertyType.Enumerated, 
Sentence_Break_Values.class, null, "SB"), Vertical_Orientation(PropertyType.Enumerated, Vertical_Orientation_Values.class, null, "vo"), Word_Break(PropertyType.Enumerated, Word_Break_Values.class, null, "WB"), kEH_Core(PropertyType.Enumerated, kEH_Core_Values.class, null, "kEH_Core"), - // Binary + // Binary ASCII_Hex_Digit(PropertyType.Binary, Binary.class, null, "AHex"), Alphabetic(PropertyType.Binary, Binary.class, null, "Alpha"), Basic_Emoji(PropertyType.Binary, Binary.class, null, "BE"), @@ -358,10 +317,8 @@ public enum UcdProperty { Prepended_Concatenation_Mark(PropertyType.Binary, Binary.class, null, "PCM"), Quotation_Mark(PropertyType.Binary, Binary.class, null, "QMark"), RGI_Emoji_Flag_Sequence(PropertyType.Binary, Binary.class, null, "REFS", "Emoji_Flag_Sequence"), - RGI_Emoji_Keycap_Sequence( - PropertyType.Binary, Binary.class, null, "REKS", "Emoji_Keycap_Sequence"), - RGI_Emoji_Modifier_Sequence( - PropertyType.Binary, Binary.class, null, "REMS", "Emoji_Modifier_Sequence"), + RGI_Emoji_Keycap_Sequence(PropertyType.Binary, Binary.class, null, "REKS", "Emoji_Keycap_Sequence"), + RGI_Emoji_Modifier_Sequence(PropertyType.Binary, Binary.class, null, "REMS", "Emoji_Modifier_Sequence"), RGI_Emoji_Tag_Sequence(PropertyType.Binary, Binary.class, null, "RETS", "Emoji_Tag_Sequence"), RGI_Emoji_Zwj_Sequence(PropertyType.Binary, Binary.class, null, "REZS", "Emoji_Zwj_Sequence"), Radical(PropertyType.Binary, Binary.class, null, "Radical"), @@ -378,17 +335,17 @@ public enum UcdProperty { kEH_NoMirror(PropertyType.Binary, Binary.class, null, "kEH_NoMirror"), kEH_NoRotate(PropertyType.Binary, Binary.class, null, "kEH_NoRotate"), -// Unknown -; + // Unknown + ; - private final PropertyType type; +private final PropertyType type; private final PropertyNames names; // for enums private final NameMatcher name2enum; private final EnumSet enums; private final Class enumClass; private final ValueCardinality cardinality; - + private UcdProperty(PropertyType type, String 
shortName, String... otherNames) { this.type = type; names = new PropertyNames(UcdProperty.class, this, shortName, otherNames); @@ -417,7 +374,7 @@ private UcdProperty( enumClass = classItem; } } - + public ValueCardinality getCardinality() { return cardinality; } diff --git a/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java b/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java index 0aac98c263..1e1194a841 100644 --- a/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java +++ b/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java @@ -17,7 +17,8 @@ public enum Binary implements Named { private final PropertyNames names; private Binary(String shortName, String... otherNames) { - names = new PropertyNames(Binary.class, this, shortName, otherNames); + names = new PropertyNames( + Binary.class, this, shortName, otherNames); } @Override @@ -30,8 +31,7 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(Binary.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Binary.class); public static Binary forName(String name) { return NAME_MATCHER.get(name); @@ -63,7 +63,7 @@ public enum Age_Values implements Named { V12_1("12.1"), V13_0("13.0"), V13_1("13.1"), // TODO: there is no Unicode 13.1, see - // https://github.com/unicode-org/unicodetools/issues/100 +// https://github.com/unicode-org/unicodetools/issues/100 V14_0("14.0"), V15_0("15.0"), V15_1("15.1"), @@ -73,7 +73,8 @@ public enum Age_Values implements Named { private final PropertyNames names; private Age_Values(String shortName, String... 
otherNames) { - names = new PropertyNames(Age_Values.class, this, shortName, otherNames); + names = new PropertyNames( + Age_Values.class, this, shortName, otherNames); } @Override @@ -86,8 +87,7 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(Age_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Age_Values.class); public static Age_Values forName(String name) { return NAME_MATCHER.get(name); @@ -121,9 +121,8 @@ public enum Bidi_Class_Values implements Named { private final PropertyNames names; private Bidi_Class_Values(String shortName, String... otherNames) { - names = - new PropertyNames( - Bidi_Class_Values.class, this, shortName, otherNames); + names = new PropertyNames( + Bidi_Class_Values.class, this, shortName, otherNames); } @Override @@ -136,16 +135,15 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(Bidi_Class_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Bidi_Class_Values.class); public static Bidi_Class_Values forName(String name) { return NAME_MATCHER.get(name); } } - // Bidi_Mirroring_Glyph - // Bidi_Paired_Bracket + // Bidi_Mirroring_Glyph + // Bidi_Paired_Bracket public enum Bidi_Paired_Bracket_Type_Values implements Named { Close("c"), None("n"), @@ -153,9 +151,8 @@ public enum Bidi_Paired_Bracket_Type_Values implements Named { private final PropertyNames names; private Bidi_Paired_Bracket_Type_Values(String shortName, String... 
otherNames) { - names = - new PropertyNames( - Bidi_Paired_Bracket_Type_Values.class, this, shortName, otherNames); + names = new PropertyNames( + Bidi_Paired_Bracket_Type_Values.class, this, shortName, otherNames); } @Override @@ -168,8 +165,7 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(Bidi_Paired_Bracket_Type_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Bidi_Paired_Bracket_Type_Values.class); public static Bidi_Paired_Bracket_Type_Values forName(String name) { return NAME_MATCHER.get(name); @@ -264,8 +260,7 @@ public enum Block_Values implements Named { Devanagari_Extended_A("Devanagari_Ext_A"), Combining_Diacritical_Marks("Diacriticals"), Combining_Diacritical_Marks_Extended("Diacriticals_Ext"), - Combining_Diacritical_Marks_For_Symbols( - "Diacriticals_For_Symbols", "Combining_Marks_For_Symbols"), + Combining_Diacritical_Marks_For_Symbols("Diacriticals_For_Symbols", "Combining_Marks_For_Symbols"), Combining_Diacritical_Marks_Supplement("Diacriticals_Sup"), Dingbats("Dingbats"), Dives_Akuru("Dives_Akuru"), @@ -529,9 +524,8 @@ public enum Block_Values implements Named { private final PropertyNames names; private Block_Values(String shortName, String... 
otherNames) { - names = - new PropertyNames( - Block_Values.class, this, shortName, otherNames); + names = new PropertyNames( + Block_Values.class, this, shortName, otherNames); } @Override @@ -544,8 +538,7 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(Block_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Block_Values.class); public static Block_Values forName(String name) { return NAME_MATCHER.get(name); @@ -614,9 +607,8 @@ public enum Canonical_Combining_Class_Values implements Named { private final PropertyNames names; private Canonical_Combining_Class_Values(String shortName, String... otherNames) { - names = - new PropertyNames( - Canonical_Combining_Class_Values.class, this, shortName, otherNames); + names = new PropertyNames( + Canonical_Combining_Class_Values.class, this, shortName, otherNames); } @Override @@ -629,21 +621,20 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(Canonical_Combining_Class_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Canonical_Combining_Class_Values.class); public static Canonical_Combining_Class_Values forName(String name) { return NAME_MATCHER.get(name); } } - // Case_Folding - // CJK_Radical - // Confusable_MA - // Confusable_ML - // Confusable_SA - // Confusable_SL - // Decomposition_Mapping + // Case_Folding + // CJK_Radical + // Confusable_MA + // Confusable_ML + // Confusable_SA + // Confusable_SL + // Decomposition_Mapping public enum Decomposition_Type_Values implements Named { Canonical("Can", "can"), Compat("Com", "com"), @@ -666,9 +657,8 @@ public enum Decomposition_Type_Values implements Named { private final PropertyNames names; private Decomposition_Type_Values(String shortName, String... 
otherNames) { - names = - new PropertyNames( - Decomposition_Type_Values.class, this, shortName, otherNames); + names = new PropertyNames( + Decomposition_Type_Values.class, this, shortName, otherNames); } @Override @@ -681,15 +671,14 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(Decomposition_Type_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Decomposition_Type_Values.class); public static Decomposition_Type_Values forName(String name) { return NAME_MATCHER.get(name); } } - // Do_Not_Emit_Preferred + // Do_Not_Emit_Preferred public enum Do_Not_Emit_Type_Values implements Named { Indic_Atomic_Consonant("Indic_Atomic_Consonant"), Indic_Consonant_Conjunct("Indic_Consonant_Conjunct"), @@ -707,9 +696,8 @@ public enum Do_Not_Emit_Type_Values implements Named { private final PropertyNames names; private Do_Not_Emit_Type_Values(String shortName, String... otherNames) { - names = - new PropertyNames( - Do_Not_Emit_Type_Values.class, this, shortName, otherNames); + names = new PropertyNames( + Do_Not_Emit_Type_Values.class, this, shortName, otherNames); } @Override @@ -722,8 +710,7 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(Do_Not_Emit_Type_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Do_Not_Emit_Type_Values.class); public static Do_Not_Emit_Type_Values forName(String name) { return NAME_MATCHER.get(name); @@ -740,9 +727,8 @@ public enum East_Asian_Width_Values implements Named { private final PropertyNames names; private East_Asian_Width_Values(String shortName, String... 
otherNames) { - names = - new PropertyNames( - East_Asian_Width_Values.class, this, shortName, otherNames); + names = new PropertyNames( + East_Asian_Width_Values.class, this, shortName, otherNames); } @Override @@ -755,20 +741,19 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(East_Asian_Width_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(East_Asian_Width_Values.class); public static East_Asian_Width_Values forName(String name) { return NAME_MATCHER.get(name); } } - // Emoji_DCM - // Emoji_KDDI - // Emoji_SB - // emoji_variation_sequence - // Equivalent_Unified_Ideograph - // FC_NFKC_Closure + // Emoji_DCM + // Emoji_KDDI + // Emoji_SB + // emoji_variation_sequence + // Equivalent_Unified_Ideograph + // FC_NFKC_Closure public enum General_Category_Values implements Named { Other("C"), Control("Cc", "cntrl"), @@ -811,9 +796,8 @@ public enum General_Category_Values implements Named { private final PropertyNames names; private General_Category_Values(String shortName, String... otherNames) { - names = - new PropertyNames( - General_Category_Values.class, this, shortName, otherNames); + names = new PropertyNames( + General_Category_Values.class, this, shortName, otherNames); } @Override @@ -826,8 +810,7 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(General_Category_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(General_Category_Values.class); public static General_Category_Values forName(String name) { return NAME_MATCHER.get(name); @@ -856,9 +839,8 @@ public enum Grapheme_Cluster_Break_Values implements Named { private final PropertyNames names; private Grapheme_Cluster_Break_Values(String shortName, String... 
otherNames) { - names = - new PropertyNames( - Grapheme_Cluster_Break_Values.class, this, shortName, otherNames); + names = new PropertyNames( + Grapheme_Cluster_Break_Values.class, this, shortName, otherNames); } @Override @@ -871,8 +853,7 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(Grapheme_Cluster_Break_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Grapheme_Cluster_Break_Values.class); public static Grapheme_Cluster_Break_Values forName(String name) { return NAME_MATCHER.get(name); @@ -889,9 +870,8 @@ public enum Hangul_Syllable_Type_Values implements Named { private final PropertyNames names; private Hangul_Syllable_Type_Values(String shortName, String... otherNames) { - names = - new PropertyNames( - Hangul_Syllable_Type_Values.class, this, shortName, otherNames); + names = new PropertyNames( + Hangul_Syllable_Type_Values.class, this, shortName, otherNames); } @Override @@ -904,8 +884,7 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(Hangul_Syllable_Type_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Hangul_Syllable_Type_Values.class); public static Hangul_Syllable_Type_Values forName(String name) { return NAME_MATCHER.get(name); @@ -918,9 +897,8 @@ public enum Identifier_Status_Values implements Named { private final PropertyNames names; private Identifier_Status_Values(String shortName, String... 
otherNames) { - names = - new PropertyNames( - Identifier_Status_Values.class, this, shortName, otherNames); + names = new PropertyNames( + Identifier_Status_Values.class, this, shortName, otherNames); } @Override @@ -933,8 +911,7 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(Identifier_Status_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Identifier_Status_Values.class); public static Identifier_Status_Values forName(String name) { return NAME_MATCHER.get(name); @@ -958,9 +935,8 @@ public enum Identifier_Type_Values implements Named { private final PropertyNames names; private Identifier_Type_Values(String shortName, String... otherNames) { - names = - new PropertyNames( - Identifier_Type_Values.class, this, shortName, otherNames); + names = new PropertyNames( + Identifier_Type_Values.class, this, shortName, otherNames); } @Override @@ -973,8 +949,7 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(Identifier_Type_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Identifier_Type_Values.class); public static Identifier_Type_Values forName(String name) { return NAME_MATCHER.get(name); @@ -988,9 +963,8 @@ public enum Idn_2008_Values implements Named { private final PropertyNames names; private Idn_2008_Values(String shortName, String... 
otherNames) { - names = - new PropertyNames( - Idn_2008_Values.class, this, shortName, otherNames); + names = new PropertyNames( + Idn_2008_Values.class, this, shortName, otherNames); } @Override @@ -1003,15 +977,14 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(Idn_2008_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Idn_2008_Values.class); public static Idn_2008_Values forName(String name) { return NAME_MATCHER.get(name); } } - // Idn_Mapping + // Idn_Mapping public enum Idn_Status_Values implements Named { valid("v"), ignored("i"), @@ -1023,9 +996,8 @@ public enum Idn_Status_Values implements Named { private final PropertyNames names; private Idn_Status_Values(String shortName, String... otherNames) { - names = - new PropertyNames( - Idn_Status_Values.class, this, shortName, otherNames); + names = new PropertyNames( + Idn_Status_Values.class, this, shortName, otherNames); } @Override @@ -1038,8 +1010,7 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(Idn_Status_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Idn_Status_Values.class); public static Idn_Status_Values forName(String name) { return NAME_MATCHER.get(name); @@ -1054,9 +1025,8 @@ public enum Indic_Conjunct_Break_Values implements Named { private final PropertyNames names; private Indic_Conjunct_Break_Values(String shortName, String... 
otherNames) { - names = - new PropertyNames( - Indic_Conjunct_Break_Values.class, this, shortName, otherNames); + names = new PropertyNames( + Indic_Conjunct_Break_Values.class, this, shortName, otherNames); } @Override @@ -1069,8 +1039,7 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(Indic_Conjunct_Break_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Indic_Conjunct_Break_Values.class); public static Indic_Conjunct_Break_Values forName(String name) { return NAME_MATCHER.get(name); @@ -1097,9 +1066,8 @@ public enum Indic_Positional_Category_Values implements Named { private final PropertyNames names; private Indic_Positional_Category_Values(String shortName, String... otherNames) { - names = - new PropertyNames( - Indic_Positional_Category_Values.class, this, shortName, otherNames); + names = new PropertyNames( + Indic_Positional_Category_Values.class, this, shortName, otherNames); } @Override @@ -1112,8 +1080,7 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(Indic_Positional_Category_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Indic_Positional_Category_Values.class); public static Indic_Positional_Category_Values forName(String name) { return NAME_MATCHER.get(name); @@ -1161,9 +1128,8 @@ public enum Indic_Syllabic_Category_Values implements Named { private final PropertyNames names; private Indic_Syllabic_Category_Values(String shortName, String... 
otherNames) { - names = - new PropertyNames( - Indic_Syllabic_Category_Values.class, this, shortName, otherNames); + names = new PropertyNames( + Indic_Syllabic_Category_Values.class, this, shortName, otherNames); } @Override @@ -1176,15 +1142,14 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(Indic_Syllabic_Category_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Indic_Syllabic_Category_Values.class); public static Indic_Syllabic_Category_Values forName(String name) { return NAME_MATCHER.get(name); } } - // ISO_Comment + // ISO_Comment public enum Jamo_Short_Name_Values implements Named { A("A"), AE("AE"), @@ -1241,9 +1206,8 @@ public enum Jamo_Short_Name_Values implements Named { private final PropertyNames names; private Jamo_Short_Name_Values(String shortName, String... otherNames) { - names = - new PropertyNames( - Jamo_Short_Name_Values.class, this, shortName, otherNames); + names = new PropertyNames( + Jamo_Short_Name_Values.class, this, shortName, otherNames); } @Override @@ -1256,8 +1220,7 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(Jamo_Short_Name_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Jamo_Short_Name_Values.class); public static Jamo_Short_Name_Values forName(String name) { return NAME_MATCHER.get(name); @@ -1374,9 +1337,8 @@ public enum Joining_Group_Values implements Named { private final PropertyNames names; private Joining_Group_Values(String shortName, String... 
otherNames) { - names = - new PropertyNames( - Joining_Group_Values.class, this, shortName, otherNames); + names = new PropertyNames( + Joining_Group_Values.class, this, shortName, otherNames); } @Override @@ -1389,8 +1351,7 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(Joining_Group_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Joining_Group_Values.class); public static Joining_Group_Values forName(String name) { return NAME_MATCHER.get(name); @@ -1407,9 +1368,8 @@ public enum Joining_Type_Values implements Named { private final PropertyNames names; private Joining_Type_Values(String shortName, String... otherNames) { - names = - new PropertyNames( - Joining_Type_Values.class, this, shortName, otherNames); + names = new PropertyNames( + Joining_Type_Values.class, this, shortName, otherNames); } @Override @@ -1422,35 +1382,34 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(Joining_Type_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Joining_Type_Values.class); public static Joining_Type_Values forName(String name) { return NAME_MATCHER.get(name); } } - // kAccountingNumeric - // kAlternateHanYu - // kAlternateJEF - // kAlternateKangXi - // kAlternateMorohashi - // kAlternateTotalStrokes - // kBigFive - // kCangjie - // kCantonese - // kCCCII - // kCheungBauer - // kCheungBauerIndex - // kCihaiT - // kCNS1986 - // kCNS1992 - // kCompatibilityVariant - // kCowles - // kDaeJaweon - // kDefinition - // kEACC - // kEH_Cat + // kAccountingNumeric + // kAlternateHanYu + // kAlternateJEF + // kAlternateKangXi + // kAlternateMorohashi + // kAlternateTotalStrokes + // kBigFive + // kCangjie + // kCantonese + // kCCCII + // kCheungBauer + // kCheungBauerIndex + // kCihaiT + // kCNS1986 + 
// kCNS1992 + // kCompatibilityVariant + // kCowles + // kDaeJaweon + // kDefinition + // kEACC + // kEH_Cat public enum kEH_Core_Values implements Named { Core("C"), Legacy("L"), @@ -1458,9 +1417,8 @@ public enum kEH_Core_Values implements Named { private final PropertyNames names; private kEH_Core_Values(String shortName, String... otherNames) { - names = - new PropertyNames( - kEH_Core_Values.class, this, shortName, otherNames); + names = new PropertyNames( + kEH_Core_Values.class, this, shortName, otherNames); } @Override @@ -1473,122 +1431,121 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(kEH_Core_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(kEH_Core_Values.class); public static kEH_Core_Values forName(String name) { return NAME_MATCHER.get(name); } } - // kEH_Desc - // kEH_Func - // kEH_FVal - // kEH_HG - // kEH_IFAO - // kEH_JSesh - // kEH_UniK - // kFanqie - // kFenn - // kFennIndex - // kFourCornerCode - // kFrequency - // kGB0 - // kGB1 - // kGB3 - // kGB5 - // kGB7 - // kGB8 - // kGradeLevel - // kGSR - // kHangul - // kHanYu - // kHanyuPinlu - // kHanyuPinyin - // kHDZRadBreak - // kHKGlyph - // kHKSCS - // kIBMJapan - // kIICore - // kIRG_GSource - // kIRG_HSource - // kIRG_JSource - // kIRG_KPSource - // kIRG_KSource - // kIRG_MSource - // kIRG_SSource - // kIRG_TSource - // kIRG_UKSource - // kIRG_USource - // kIRG_VSource - // kIRGDaeJaweon - // kIRGDaiKanwaZiten - // kIRGHanyuDaZidian - // kIRGKangXi - // kJa - // kJapanese - // kJapaneseKun - // kJapaneseOn - // kJHJ - // kJinmeiyoKanji - // kJis0 - // kJis1 - // kJIS0213 - // kJoyoKanji - // kKangXi - // kKarlgren - // kKorean - // kKoreanEducationHanja - // kKoreanName - // kKPS0 - // kKPS1 - // kKSC0 - // kKSC1 - // kLau - // kMainlandTelegraph - // kMandarin - // kMatthews - // kMeyerWempe - // kMojiJoho - // kMorohashi - // kNelson - // 
kOtherNumeric - // kPhonetic - // kPrimaryNumeric - // kPseudoGB1 - // kReading - // kRSAdobe_Japan1_6 - // kRSJapanese - // kRSKangXi - // kRSKanWa - // kRSKorean - // kRSMerged - // kRSTUnicode - // kRSUnicode - // kSBGY - // kSemanticVariant - // kSimplifiedVariant - // kSMSZD2003Index - // kSMSZD2003Readings - // kSpecializedSemanticVariant - // kSpoofingVariant - // kSrc_NushuDuben - // kStrange - // kTaiwanTelegraph - // kTang - // kTGH - // kTGHZ2013 - // kTGT_MergedSrc - // kTotalStrokes - // kTraditionalVariant - // kUnihanCore2020 - // kVietnamese - // kVietnameseNumeric - // kXerox - // kXHC1983 - // kZhuang - // kZhuangNumeric - // kZVariant + // kEH_Desc + // kEH_Func + // kEH_FVal + // kEH_HG + // kEH_IFAO + // kEH_JSesh + // kEH_UniK + // kFanqie + // kFenn + // kFennIndex + // kFourCornerCode + // kFrequency + // kGB0 + // kGB1 + // kGB3 + // kGB5 + // kGB7 + // kGB8 + // kGradeLevel + // kGSR + // kHangul + // kHanYu + // kHanyuPinlu + // kHanyuPinyin + // kHDZRadBreak + // kHKGlyph + // kHKSCS + // kIBMJapan + // kIICore + // kIRG_GSource + // kIRG_HSource + // kIRG_JSource + // kIRG_KPSource + // kIRG_KSource + // kIRG_MSource + // kIRG_SSource + // kIRG_TSource + // kIRG_UKSource + // kIRG_USource + // kIRG_VSource + // kIRGDaeJaweon + // kIRGDaiKanwaZiten + // kIRGHanyuDaZidian + // kIRGKangXi + // kJa + // kJapanese + // kJapaneseKun + // kJapaneseOn + // kJHJ + // kJinmeiyoKanji + // kJis0 + // kJis1 + // kJIS0213 + // kJoyoKanji + // kKangXi + // kKarlgren + // kKorean + // kKoreanEducationHanja + // kKoreanName + // kKPS0 + // kKPS1 + // kKSC0 + // kKSC1 + // kLau + // kMainlandTelegraph + // kMandarin + // kMatthews + // kMeyerWempe + // kMojiJoho + // kMorohashi + // kNelson + // kOtherNumeric + // kPhonetic + // kPrimaryNumeric + // kPseudoGB1 + // kReading + // kRSAdobe_Japan1_6 + // kRSJapanese + // kRSKangXi + // kRSKanWa + // kRSKorean + // kRSMerged + // kRSTUnicode + // kRSUnicode + // kSBGY + // kSemanticVariant + // 
kSimplifiedVariant + // kSMSZD2003Index + // kSMSZD2003Readings + // kSpecializedSemanticVariant + // kSpoofingVariant + // kSrc_NushuDuben + // kStrange + // kTaiwanTelegraph + // kTang + // kTGH + // kTGHZ2013 + // kTGT_MergedSrc + // kTotalStrokes + // kTraditionalVariant + // kUnihanCore2020 + // kVietnamese + // kVietnameseNumeric + // kXerox + // kXHC1983 + // kZhuang + // kZhuangNumeric + // kZVariant public enum Line_Break_Values implements Named { Ambiguous("AI"), Aksara("AK"), @@ -1641,9 +1598,8 @@ public enum Line_Break_Values implements Named { private final PropertyNames names; private Line_Break_Values(String shortName, String... otherNames) { - names = - new PropertyNames( - Line_Break_Values.class, this, shortName, otherNames); + names = new PropertyNames( + Line_Break_Values.class, this, shortName, otherNames); } @Override @@ -1656,22 +1612,21 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(Line_Break_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Line_Break_Values.class); public static Line_Break_Values forName(String name) { return NAME_MATCHER.get(name); } } - // Lowercase_Mapping - // Name - // Name_Alias - // Named_Sequences - // Named_Sequences_Prov - // NC_Corrected - // NC_Original - // NC_Version + // Lowercase_Mapping + // Name + // Name_Alias + // Named_Sequences + // Named_Sequences_Prov + // NC_Corrected + // NC_Original + // NC_Version public enum NFC_Quick_Check_Values implements Named { Maybe("M"), No("N"), @@ -1679,9 +1634,8 @@ public enum NFC_Quick_Check_Values implements Named { private final PropertyNames names; private NFC_Quick_Check_Values(String shortName, String... 
otherNames) { - names = - new PropertyNames( - NFC_Quick_Check_Values.class, this, shortName, otherNames); + names = new PropertyNames( + NFC_Quick_Check_Values.class, this, shortName, otherNames); } @Override @@ -1694,8 +1648,7 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(NFC_Quick_Check_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(NFC_Quick_Check_Values.class); public static NFC_Quick_Check_Values forName(String name) { return NAME_MATCHER.get(name); @@ -1708,9 +1661,8 @@ public enum NFD_Quick_Check_Values implements Named { private final PropertyNames names; private NFD_Quick_Check_Values(String shortName, String... otherNames) { - names = - new PropertyNames( - NFD_Quick_Check_Values.class, this, shortName, otherNames); + names = new PropertyNames( + NFD_Quick_Check_Values.class, this, shortName, otherNames); } @Override @@ -1723,15 +1675,14 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(NFD_Quick_Check_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(NFD_Quick_Check_Values.class); public static NFD_Quick_Check_Values forName(String name) { return NAME_MATCHER.get(name); } } - // NFKC_Casefold + // NFKC_Casefold public enum NFKC_Quick_Check_Values implements Named { Maybe("M"), No("N"), @@ -1739,9 +1690,8 @@ public enum NFKC_Quick_Check_Values implements Named { private final PropertyNames names; private NFKC_Quick_Check_Values(String shortName, String... 
otherNames) { - names = - new PropertyNames( - NFKC_Quick_Check_Values.class, this, shortName, otherNames); + names = new PropertyNames( + NFKC_Quick_Check_Values.class, this, shortName, otherNames); } @Override @@ -1754,24 +1704,22 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(NFKC_Quick_Check_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(NFKC_Quick_Check_Values.class); public static NFKC_Quick_Check_Values forName(String name) { return NAME_MATCHER.get(name); } } - // NFKC_Simple_Casefold + // NFKC_Simple_Casefold public enum NFKD_Quick_Check_Values implements Named { No("N"), Yes("Y"); private final PropertyNames names; private NFKD_Quick_Check_Values(String shortName, String... otherNames) { - names = - new PropertyNames( - NFKD_Quick_Check_Values.class, this, shortName, otherNames); + names = new PropertyNames( + NFKD_Quick_Check_Values.class, this, shortName, otherNames); } @Override @@ -1784,8 +1732,7 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(NFKD_Quick_Check_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(NFKD_Quick_Check_Values.class); public static NFKD_Quick_Check_Values forName(String name) { return NAME_MATCHER.get(name); @@ -1800,9 +1747,8 @@ public enum Numeric_Type_Values implements Named { private final PropertyNames names; private Numeric_Type_Values(String shortName, String... 
otherNames) { - names = - new PropertyNames( - Numeric_Type_Values.class, this, shortName, otherNames); + names = new PropertyNames( + Numeric_Type_Values.class, this, shortName, otherNames); } @Override @@ -1815,15 +1761,14 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(Numeric_Type_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Numeric_Type_Values.class); public static Numeric_Type_Values forName(String name) { return NAME_MATCHER.get(name); } } - // Numeric_Value + // Numeric_Value public enum Other_Joining_Type_Values implements Named { Join_Causing("C"), Dual_Joining("D"), @@ -1835,9 +1780,8 @@ public enum Other_Joining_Type_Values implements Named { private final PropertyNames names; private Other_Joining_Type_Values(String shortName, String... otherNames) { - names = - new PropertyNames( - Other_Joining_Type_Values.class, this, shortName, otherNames); + names = new PropertyNames( + Other_Joining_Type_Values.class, this, shortName, otherNames); } @Override @@ -1850,8 +1794,7 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(Other_Joining_Type_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Other_Joining_Type_Values.class); public static Other_Joining_Type_Values forName(String name) { return NAME_MATCHER.get(name); @@ -2046,9 +1989,8 @@ public enum Script_Values implements Named { private final PropertyNames names; private Script_Values(String shortName, String... 
otherNames) { - names = - new PropertyNames( - Script_Values.class, this, shortName, otherNames); + names = new PropertyNames( + Script_Values.class, this, shortName, otherNames); } @Override @@ -2061,15 +2003,14 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(Script_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Script_Values.class); public static Script_Values forName(String name) { return NAME_MATCHER.get(name); } } - // Script_Extensions + // Script_Extensions public enum Sentence_Break_Values implements Named { ATerm("AT"), Close("CL"), @@ -2089,9 +2030,8 @@ public enum Sentence_Break_Values implements Named { private final PropertyNames names; private Sentence_Break_Values(String shortName, String... otherNames) { - names = - new PropertyNames( - Sentence_Break_Values.class, this, shortName, otherNames); + names = new PropertyNames( + Sentence_Break_Values.class, this, shortName, otherNames); } @Override @@ -2104,22 +2044,21 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(Sentence_Break_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Sentence_Break_Values.class); public static Sentence_Break_Values forName(String name) { return NAME_MATCHER.get(name); } } - // Simple_Case_Folding - // Simple_Lowercase_Mapping - // Simple_Titlecase_Mapping - // Simple_Uppercase_Mapping - // Standardized_Variant - // Titlecase_Mapping - // Unicode_1_Name - // Uppercase_Mapping + // Simple_Case_Folding + // Simple_Lowercase_Mapping + // Simple_Titlecase_Mapping + // Simple_Uppercase_Mapping + // Standardized_Variant + // Titlecase_Mapping + // Unicode_1_Name + // Uppercase_Mapping public enum Vertical_Orientation_Values implements Named { Rotated("R"), Transformed_Rotated("Tr"), @@ -2128,9 
+2067,8 @@ public enum Vertical_Orientation_Values implements Named { private final PropertyNames names; private Vertical_Orientation_Values(String shortName, String... otherNames) { - names = - new PropertyNames( - Vertical_Orientation_Values.class, this, shortName, otherNames); + names = new PropertyNames( + Vertical_Orientation_Values.class, this, shortName, otherNames); } @Override @@ -2143,8 +2081,7 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(Vertical_Orientation_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Vertical_Orientation_Values.class); public static Vertical_Orientation_Values forName(String name) { return NAME_MATCHER.get(name); @@ -2178,9 +2115,8 @@ public enum Word_Break_Values implements Named { private final PropertyNames names; private Word_Break_Values(String shortName, String... otherNames) { - names = - new PropertyNames( - Word_Break_Values.class, this, shortName, otherNames); + names = new PropertyNames( + Word_Break_Values.class, this, shortName, otherNames); } @Override @@ -2193,11 +2129,11 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = - PropertyNames.getNameToEnums(Word_Break_Values.class); + private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Word_Break_Values.class); public static Word_Break_Values forName(String name) { return NAME_MATCHER.get(name); } } + } From 9d8fb602704e847fcb0a5f61efe349d5847129af Mon Sep 17 00:00:00 2001 From: John Wilcock Date: Sat, 8 Feb 2025 13:32:04 -0800 Subject: [PATCH 07/10] More review changes from Markus --- .../src/main/java/org/unicode/xml/UCDXML.java | 29 +++---------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/xml/UCDXML.java b/unicodetools/src/main/java/org/unicode/xml/UCDXML.java index 
6bcfa74510..8cbe59d78d 100644 --- a/unicodetools/src/main/java/org/unicode/xml/UCDXML.java +++ b/unicodetools/src/main/java/org/unicode/xml/UCDXML.java @@ -140,7 +140,7 @@ public static void main(String[] args) throws Exception { destinationFolder = new File( options[OUTPUTFOLDER].value - + getVersionString(ucdVersion, 3) + + ucdVersion.getVersionString(3, 3) + "\\xmltest\\"); if (!destinationFolder.exists()) { if (!destinationFolder.mkdir()) { @@ -217,7 +217,7 @@ private static void buildUcdXMLFile( { writer.startElement("description"); { - writer.addContent("Unicode " + getVersionString(ucdVersion, 3)); + writer.addContent("Unicode " + ucdVersion.getVersionString(3, 3)); writer.endElement("description"); } buildRepertoire( @@ -668,7 +668,7 @@ private static AttributesImpl getAttributes( if (isAttributeIncluded) { String propName = prop.getShortName(); if (propName.startsWith("cjk")) { - propName = propName.substring(2); + propName = prop.getNames().getAllNames().get(1); } attributes.addAttribute(NAMESPACE, propName, propName, "CDATA", attrValue); } @@ -737,7 +737,7 @@ private static AttributesImpl getGroupAttributes( if (isAttributeIncluded) { String propName = prop.getShortName(); if (propName.startsWith("cjk")) { - propName = propName.substring(2); + propName = prop.getNames().getAllNames().get(1); } attributes.addAttribute( NAMESPACE, propName, propName, "CDATA", bestAttrValue); @@ -817,25 +817,4 @@ private static AttributesImpl getReservedAttributes( } return attributes; } - - private static String getVersionString(VersionInfo version, int maxDigits) { - if (maxDigits >= 1 && maxDigits <= 4) { - int[] digits = - new int[] { - version.getMajor(), - version.getMinor(), - version.getMilli(), - version.getMicro() - }; - StringBuilder verStr = new StringBuilder(7); - verStr.append(digits[0]); - for (int i = 1; i < maxDigits; ++i) { - verStr.append("."); - verStr.append(digits[i]); - } - return verStr.toString(); - } else { - throw new 
IllegalArgumentException("Invalid maxDigits range"); - } - } } From d6c8aa27a785f1468080c2105fedbdcd175bed2c Mon Sep 17 00:00:00 2001 From: John Wilcock Date: Sat, 8 Feb 2025 13:49:03 -0800 Subject: [PATCH 08/10] Ran spotless --- .../java/org/unicode/props/UcdProperty.java | 117 ++-- .../org/unicode/props/UcdPropertyValues.java | 608 ++++++++++-------- 2 files changed, 416 insertions(+), 309 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/props/UcdProperty.java b/unicodetools/src/main/java/org/unicode/props/UcdProperty.java index 73760b1d8d..ec3d513a3a 100644 --- a/unicodetools/src/main/java/org/unicode/props/UcdProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UcdProperty.java @@ -44,13 +44,13 @@ */ public enum UcdProperty { - // Numeric + // Numeric Numeric_Value(PropertyType.Numeric, "nv"), kAccountingNumeric(PropertyType.Numeric, "cjkAccountingNumeric"), kOtherNumeric(PropertyType.Numeric, "cjkOtherNumeric"), kPrimaryNumeric(PropertyType.Numeric, null, ValueCardinality.Ordered, "cjkPrimaryNumeric"), - // String + // String Bidi_Mirroring_Glyph(PropertyType.String, "bmg"), Bidi_Paired_Bracket(PropertyType.String, "bpb"), Case_Folding(PropertyType.String, "cf"), @@ -73,10 +73,12 @@ public enum UcdProperty { Titlecase_Mapping(PropertyType.String, "tc"), Uppercase_Mapping(PropertyType.String, "uc"), kCompatibilityVariant(PropertyType.String, "cjkCompatibilityVariant"), - kSimplifiedVariant(PropertyType.String, null, ValueCardinality.Unordered, "cjkSimplifiedVariant"), - kTraditionalVariant(PropertyType.String, null, ValueCardinality.Unordered, "cjkTraditionalVariant"), + kSimplifiedVariant( + PropertyType.String, null, ValueCardinality.Unordered, "cjkSimplifiedVariant"), + kTraditionalVariant( + PropertyType.String, null, ValueCardinality.Unordered, "cjkTraditionalVariant"), - // Miscellaneous + // Miscellaneous CJK_Radical(PropertyType.Miscellaneous, null, ValueCardinality.Ordered, "CJKR"), Emoji_DCM(PropertyType.Miscellaneous, 
"EDCM"), Emoji_KDDI(PropertyType.Miscellaneous, "EKDDI"), @@ -97,7 +99,11 @@ public enum UcdProperty { kAlternateJEF(PropertyType.Miscellaneous, "cjkAlternateJEF"), kAlternateKangXi(PropertyType.Miscellaneous, "cjkAlternateKangXi"), kAlternateMorohashi(PropertyType.Miscellaneous, "cjkAlternateMorohashi"), - kAlternateTotalStrokes(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkAlternateTotalStrokes"), + kAlternateTotalStrokes( + PropertyType.Miscellaneous, + null, + ValueCardinality.Unordered, + "cjkAlternateTotalStrokes"), kBigFive(PropertyType.Miscellaneous, "cjkBigFive"), kCCCII(PropertyType.Miscellaneous, "cjkCCCII"), kCNS1986(PropertyType.Miscellaneous, "cjkCNS1986"), @@ -105,7 +111,8 @@ public enum UcdProperty { kCangjie(PropertyType.Miscellaneous, "cjkCangjie"), kCantonese(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkCantonese"), kCheungBauer(PropertyType.Miscellaneous, "cjkCheungBauer"), - kCheungBauerIndex(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkCheungBauerIndex"), + kCheungBauerIndex( + PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkCheungBauerIndex"), kCihaiT(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkCihaiT"), kCowles(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkCowles"), kDaeJaweon(PropertyType.Miscellaneous, "cjkDaeJaweon"), @@ -122,7 +129,8 @@ public enum UcdProperty { kFanqie(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkFanqie"), kFenn(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkFenn"), kFennIndex(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkFennIndex"), - kFourCornerCode(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkFourCornerCode"), + kFourCornerCode( + PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkFourCornerCode"), kFrequency(PropertyType.Miscellaneous, "cjkFrequency"), kGB0(PropertyType.Miscellaneous, "cjkGB0"), 
kGB1(PropertyType.Miscellaneous, "cjkGB1"), @@ -162,7 +170,8 @@ public enum UcdProperty { kJapanese(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkJapanese"), kJapaneseKun(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkJapaneseKun"), kJapaneseOn(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkJapaneseOn"), - kJinmeiyoKanji(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkJinmeiyoKanji"), + kJinmeiyoKanji( + PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkJinmeiyoKanji"), kJis0(PropertyType.Miscellaneous, "cjkJis0"), kJis1(PropertyType.Miscellaneous, "cjkJis1"), kJoyoKanji(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkJoyoKanji"), @@ -173,7 +182,11 @@ public enum UcdProperty { kKangXi(PropertyType.Miscellaneous, "cjkKangXi"), kKarlgren(PropertyType.Miscellaneous, "cjkKarlgren"), kKorean(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkKorean"), - kKoreanEducationHanja(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkKoreanEducationHanja"), + kKoreanEducationHanja( + PropertyType.Miscellaneous, + null, + ValueCardinality.Unordered, + "cjkKoreanEducationHanja"), kKoreanName(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkKoreanName"), kLau(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkLau"), kMainlandTelegraph(PropertyType.Miscellaneous, "cjkMainlandTelegraph"), @@ -185,21 +198,36 @@ public enum UcdProperty { kNelson(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkNelson"), kPhonetic(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkPhonetic"), kPseudoGB1(PropertyType.Miscellaneous, "cjkPseudoGB1"), - kRSAdobe_Japan1_6(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkRSAdobe_Japan1_6"), + kRSAdobe_Japan1_6( + PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkRSAdobe_Japan1_6"), kRSJapanese(PropertyType.Miscellaneous, 
"cjkRSJapanese"), kRSKanWa(PropertyType.Miscellaneous, "cjkRSKanWa"), kRSKangXi(PropertyType.Miscellaneous, "cjkRSKangXi"), kRSKorean(PropertyType.Miscellaneous, "cjkRSKorean"), kRSMerged(PropertyType.Miscellaneous, "cjkRSMerged"), kRSTUnicode(PropertyType.Miscellaneous, "kRSTUnicode"), - kRSUnicode(PropertyType.Miscellaneous, null, ValueCardinality.Ordered, "cjkRSUnicode", "Unicode_Radical_Stroke", "URS"), + kRSUnicode( + PropertyType.Miscellaneous, + null, + ValueCardinality.Ordered, + "cjkRSUnicode", + "Unicode_Radical_Stroke", + "URS"), kReading(PropertyType.Miscellaneous, "kReading"), kSBGY(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSBGY"), - kSMSZD2003Index(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSMSZD2003Index"), - kSMSZD2003Readings(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSMSZD2003Readings"), - kSemanticVariant(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSemanticVariant"), - kSpecializedSemanticVariant(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSpecializedSemanticVariant"), - kSpoofingVariant(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSpoofingVariant"), + kSMSZD2003Index( + PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSMSZD2003Index"), + kSMSZD2003Readings( + PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSMSZD2003Readings"), + kSemanticVariant( + PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSemanticVariant"), + kSpecializedSemanticVariant( + PropertyType.Miscellaneous, + null, + ValueCardinality.Unordered, + "cjkSpecializedSemanticVariant"), + kSpoofingVariant( + PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSpoofingVariant"), kSrc_NushuDuben(PropertyType.Miscellaneous, "kSrc_NushuDuben"), kStrange(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkStrange"), kTGH(PropertyType.Miscellaneous, null, 
ValueCardinality.Unordered, "cjkTGH"), @@ -210,36 +238,48 @@ public enum UcdProperty { kTotalStrokes(PropertyType.Miscellaneous, null, ValueCardinality.Ordered, "cjkTotalStrokes"), kUnihanCore2020(PropertyType.Miscellaneous, "cjkUnihanCore2020"), kVietnamese(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkVietnamese"), - kVietnameseNumeric(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkVietnameseNumeric"), + kVietnameseNumeric( + PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkVietnameseNumeric"), kXHC1983(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkXHC1983"), kXerox(PropertyType.Miscellaneous, "cjkXerox"), kZVariant(PropertyType.Miscellaneous, "cjkZVariant"), kZhuang(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkZhuang"), - kZhuangNumeric(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkZhuangNumeric"), + kZhuangNumeric( + PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkZhuangNumeric"), - // Catalog + // Catalog Age(PropertyType.Catalog, Age_Values.class, null, "age"), Block(PropertyType.Catalog, Block_Values.class, null, "blk"), Script(PropertyType.Catalog, Script_Values.class, null, "sc"), Script_Extensions(PropertyType.Catalog, Script_Values.class, ValueCardinality.Unordered, "scx"), - // Enumerated + // Enumerated Bidi_Class(PropertyType.Enumerated, Bidi_Class_Values.class, null, "bc"), - Bidi_Paired_Bracket_Type(PropertyType.Enumerated, Bidi_Paired_Bracket_Type_Values.class, null, "bpt"), - Canonical_Combining_Class(PropertyType.Enumerated, Canonical_Combining_Class_Values.class, null, "ccc"), + Bidi_Paired_Bracket_Type( + PropertyType.Enumerated, Bidi_Paired_Bracket_Type_Values.class, null, "bpt"), + Canonical_Combining_Class( + PropertyType.Enumerated, Canonical_Combining_Class_Values.class, null, "ccc"), Decomposition_Type(PropertyType.Enumerated, Decomposition_Type_Values.class, null, "dt"), - 
Do_Not_Emit_Type(PropertyType.Enumerated, Do_Not_Emit_Type_Values.class, null, "Do_Not_Emit_Type"), + Do_Not_Emit_Type( + PropertyType.Enumerated, Do_Not_Emit_Type_Values.class, null, "Do_Not_Emit_Type"), East_Asian_Width(PropertyType.Enumerated, East_Asian_Width_Values.class, null, "ea"), General_Category(PropertyType.Enumerated, General_Category_Values.class, null, "gc"), - Grapheme_Cluster_Break(PropertyType.Enumerated, Grapheme_Cluster_Break_Values.class, null, "GCB"), + Grapheme_Cluster_Break( + PropertyType.Enumerated, Grapheme_Cluster_Break_Values.class, null, "GCB"), Hangul_Syllable_Type(PropertyType.Enumerated, Hangul_Syllable_Type_Values.class, null, "hst"), Identifier_Status(PropertyType.Enumerated, Identifier_Status_Values.class, null, "ID_Status"), - Identifier_Type(PropertyType.Enumerated, Identifier_Type_Values.class, ValueCardinality.Unordered, "ID_Type"), + Identifier_Type( + PropertyType.Enumerated, + Identifier_Type_Values.class, + ValueCardinality.Unordered, + "ID_Type"), Idn_2008(PropertyType.Enumerated, Idn_2008_Values.class, null, "idn8"), Idn_Status(PropertyType.Enumerated, Idn_Status_Values.class, null, "idns"), Indic_Conjunct_Break(PropertyType.Enumerated, Indic_Conjunct_Break_Values.class, null, "InCB"), - Indic_Positional_Category(PropertyType.Enumerated, Indic_Positional_Category_Values.class, null, "InPC"), - Indic_Syllabic_Category(PropertyType.Enumerated, Indic_Syllabic_Category_Values.class, null, "InSC"), + Indic_Positional_Category( + PropertyType.Enumerated, Indic_Positional_Category_Values.class, null, "InPC"), + Indic_Syllabic_Category( + PropertyType.Enumerated, Indic_Syllabic_Category_Values.class, null, "InSC"), Joining_Group(PropertyType.Enumerated, Joining_Group_Values.class, null, "jg"), Joining_Type(PropertyType.Enumerated, Joining_Type_Values.class, null, "jt"), Line_Break(PropertyType.Enumerated, Line_Break_Values.class, null, "lb"), @@ -248,13 +288,14 @@ public enum UcdProperty { 
NFKC_Quick_Check(PropertyType.Enumerated, NFKC_Quick_Check_Values.class, null, "NFKC_QC"), NFKD_Quick_Check(PropertyType.Enumerated, NFKD_Quick_Check_Values.class, null, "NFKD_QC"), Numeric_Type(PropertyType.Enumerated, Numeric_Type_Values.class, null, "nt"), - Other_Joining_Type(PropertyType.Enumerated, Other_Joining_Type_Values.class, null, "Other_Joining_Type"), + Other_Joining_Type( + PropertyType.Enumerated, Other_Joining_Type_Values.class, null, "Other_Joining_Type"), Sentence_Break(PropertyType.Enumerated, Sentence_Break_Values.class, null, "SB"), Vertical_Orientation(PropertyType.Enumerated, Vertical_Orientation_Values.class, null, "vo"), Word_Break(PropertyType.Enumerated, Word_Break_Values.class, null, "WB"), kEH_Core(PropertyType.Enumerated, kEH_Core_Values.class, null, "kEH_Core"), - // Binary + // Binary ASCII_Hex_Digit(PropertyType.Binary, Binary.class, null, "AHex"), Alphabetic(PropertyType.Binary, Binary.class, null, "Alpha"), Basic_Emoji(PropertyType.Binary, Binary.class, null, "BE"), @@ -317,8 +358,10 @@ public enum UcdProperty { Prepended_Concatenation_Mark(PropertyType.Binary, Binary.class, null, "PCM"), Quotation_Mark(PropertyType.Binary, Binary.class, null, "QMark"), RGI_Emoji_Flag_Sequence(PropertyType.Binary, Binary.class, null, "REFS", "Emoji_Flag_Sequence"), - RGI_Emoji_Keycap_Sequence(PropertyType.Binary, Binary.class, null, "REKS", "Emoji_Keycap_Sequence"), - RGI_Emoji_Modifier_Sequence(PropertyType.Binary, Binary.class, null, "REMS", "Emoji_Modifier_Sequence"), + RGI_Emoji_Keycap_Sequence( + PropertyType.Binary, Binary.class, null, "REKS", "Emoji_Keycap_Sequence"), + RGI_Emoji_Modifier_Sequence( + PropertyType.Binary, Binary.class, null, "REMS", "Emoji_Modifier_Sequence"), RGI_Emoji_Tag_Sequence(PropertyType.Binary, Binary.class, null, "RETS", "Emoji_Tag_Sequence"), RGI_Emoji_Zwj_Sequence(PropertyType.Binary, Binary.class, null, "REZS", "Emoji_Zwj_Sequence"), Radical(PropertyType.Binary, Binary.class, null, "Radical"), @@ -335,17 
+378,17 @@ public enum UcdProperty { kEH_NoMirror(PropertyType.Binary, Binary.class, null, "kEH_NoMirror"), kEH_NoRotate(PropertyType.Binary, Binary.class, null, "kEH_NoRotate"), - // Unknown - ; +// Unknown +; -private final PropertyType type; + private final PropertyType type; private final PropertyNames names; // for enums private final NameMatcher name2enum; private final EnumSet enums; private final Class enumClass; private final ValueCardinality cardinality; - + private UcdProperty(PropertyType type, String shortName, String... otherNames) { this.type = type; names = new PropertyNames(UcdProperty.class, this, shortName, otherNames); @@ -374,7 +417,7 @@ private UcdProperty( enumClass = classItem; } } - + public ValueCardinality getCardinality() { return cardinality; } diff --git a/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java b/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java index 1e1194a841..0aac98c263 100644 --- a/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java +++ b/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java @@ -17,8 +17,7 @@ public enum Binary implements Named { private final PropertyNames names; private Binary(String shortName, String... 
otherNames) { - names = new PropertyNames( - Binary.class, this, shortName, otherNames); + names = new PropertyNames(Binary.class, this, shortName, otherNames); } @Override @@ -31,7 +30,8 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Binary.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Binary.class); public static Binary forName(String name) { return NAME_MATCHER.get(name); @@ -63,7 +63,7 @@ public enum Age_Values implements Named { V12_1("12.1"), V13_0("13.0"), V13_1("13.1"), // TODO: there is no Unicode 13.1, see -// https://github.com/unicode-org/unicodetools/issues/100 + // https://github.com/unicode-org/unicodetools/issues/100 V14_0("14.0"), V15_0("15.0"), V15_1("15.1"), @@ -73,8 +73,7 @@ public enum Age_Values implements Named { private final PropertyNames names; private Age_Values(String shortName, String... otherNames) { - names = new PropertyNames( - Age_Values.class, this, shortName, otherNames); + names = new PropertyNames(Age_Values.class, this, shortName, otherNames); } @Override @@ -87,7 +86,8 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Age_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Age_Values.class); public static Age_Values forName(String name) { return NAME_MATCHER.get(name); @@ -121,8 +121,9 @@ public enum Bidi_Class_Values implements Named { private final PropertyNames names; private Bidi_Class_Values(String shortName, String... 
otherNames) { - names = new PropertyNames( - Bidi_Class_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + Bidi_Class_Values.class, this, shortName, otherNames); } @Override @@ -135,15 +136,16 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Bidi_Class_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Bidi_Class_Values.class); public static Bidi_Class_Values forName(String name) { return NAME_MATCHER.get(name); } } - // Bidi_Mirroring_Glyph - // Bidi_Paired_Bracket + // Bidi_Mirroring_Glyph + // Bidi_Paired_Bracket public enum Bidi_Paired_Bracket_Type_Values implements Named { Close("c"), None("n"), @@ -151,8 +153,9 @@ public enum Bidi_Paired_Bracket_Type_Values implements Named { private final PropertyNames names; private Bidi_Paired_Bracket_Type_Values(String shortName, String... otherNames) { - names = new PropertyNames( - Bidi_Paired_Bracket_Type_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + Bidi_Paired_Bracket_Type_Values.class, this, shortName, otherNames); } @Override @@ -165,7 +168,8 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Bidi_Paired_Bracket_Type_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Bidi_Paired_Bracket_Type_Values.class); public static Bidi_Paired_Bracket_Type_Values forName(String name) { return NAME_MATCHER.get(name); @@ -260,7 +264,8 @@ public enum Block_Values implements Named { Devanagari_Extended_A("Devanagari_Ext_A"), Combining_Diacritical_Marks("Diacriticals"), Combining_Diacritical_Marks_Extended("Diacriticals_Ext"), - Combining_Diacritical_Marks_For_Symbols("Diacriticals_For_Symbols", "Combining_Marks_For_Symbols"), + Combining_Diacritical_Marks_For_Symbols( + "Diacriticals_For_Symbols", 
"Combining_Marks_For_Symbols"), Combining_Diacritical_Marks_Supplement("Diacriticals_Sup"), Dingbats("Dingbats"), Dives_Akuru("Dives_Akuru"), @@ -524,8 +529,9 @@ public enum Block_Values implements Named { private final PropertyNames names; private Block_Values(String shortName, String... otherNames) { - names = new PropertyNames( - Block_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + Block_Values.class, this, shortName, otherNames); } @Override @@ -538,7 +544,8 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Block_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Block_Values.class); public static Block_Values forName(String name) { return NAME_MATCHER.get(name); @@ -607,8 +614,9 @@ public enum Canonical_Combining_Class_Values implements Named { private final PropertyNames names; private Canonical_Combining_Class_Values(String shortName, String... 
otherNames) { - names = new PropertyNames( - Canonical_Combining_Class_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + Canonical_Combining_Class_Values.class, this, shortName, otherNames); } @Override @@ -621,20 +629,21 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Canonical_Combining_Class_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Canonical_Combining_Class_Values.class); public static Canonical_Combining_Class_Values forName(String name) { return NAME_MATCHER.get(name); } } - // Case_Folding - // CJK_Radical - // Confusable_MA - // Confusable_ML - // Confusable_SA - // Confusable_SL - // Decomposition_Mapping + // Case_Folding + // CJK_Radical + // Confusable_MA + // Confusable_ML + // Confusable_SA + // Confusable_SL + // Decomposition_Mapping public enum Decomposition_Type_Values implements Named { Canonical("Can", "can"), Compat("Com", "com"), @@ -657,8 +666,9 @@ public enum Decomposition_Type_Values implements Named { private final PropertyNames names; private Decomposition_Type_Values(String shortName, String... 
otherNames) { - names = new PropertyNames( - Decomposition_Type_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + Decomposition_Type_Values.class, this, shortName, otherNames); } @Override @@ -671,14 +681,15 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Decomposition_Type_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Decomposition_Type_Values.class); public static Decomposition_Type_Values forName(String name) { return NAME_MATCHER.get(name); } } - // Do_Not_Emit_Preferred + // Do_Not_Emit_Preferred public enum Do_Not_Emit_Type_Values implements Named { Indic_Atomic_Consonant("Indic_Atomic_Consonant"), Indic_Consonant_Conjunct("Indic_Consonant_Conjunct"), @@ -696,8 +707,9 @@ public enum Do_Not_Emit_Type_Values implements Named { private final PropertyNames names; private Do_Not_Emit_Type_Values(String shortName, String... otherNames) { - names = new PropertyNames( - Do_Not_Emit_Type_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + Do_Not_Emit_Type_Values.class, this, shortName, otherNames); } @Override @@ -710,7 +722,8 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Do_Not_Emit_Type_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Do_Not_Emit_Type_Values.class); public static Do_Not_Emit_Type_Values forName(String name) { return NAME_MATCHER.get(name); @@ -727,8 +740,9 @@ public enum East_Asian_Width_Values implements Named { private final PropertyNames names; private East_Asian_Width_Values(String shortName, String... 
otherNames) { - names = new PropertyNames( - East_Asian_Width_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + East_Asian_Width_Values.class, this, shortName, otherNames); } @Override @@ -741,19 +755,20 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(East_Asian_Width_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(East_Asian_Width_Values.class); public static East_Asian_Width_Values forName(String name) { return NAME_MATCHER.get(name); } } - // Emoji_DCM - // Emoji_KDDI - // Emoji_SB - // emoji_variation_sequence - // Equivalent_Unified_Ideograph - // FC_NFKC_Closure + // Emoji_DCM + // Emoji_KDDI + // Emoji_SB + // emoji_variation_sequence + // Equivalent_Unified_Ideograph + // FC_NFKC_Closure public enum General_Category_Values implements Named { Other("C"), Control("Cc", "cntrl"), @@ -796,8 +811,9 @@ public enum General_Category_Values implements Named { private final PropertyNames names; private General_Category_Values(String shortName, String... otherNames) { - names = new PropertyNames( - General_Category_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + General_Category_Values.class, this, shortName, otherNames); } @Override @@ -810,7 +826,8 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(General_Category_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(General_Category_Values.class); public static General_Category_Values forName(String name) { return NAME_MATCHER.get(name); @@ -839,8 +856,9 @@ public enum Grapheme_Cluster_Break_Values implements Named { private final PropertyNames names; private Grapheme_Cluster_Break_Values(String shortName, String... 
otherNames) { - names = new PropertyNames( - Grapheme_Cluster_Break_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + Grapheme_Cluster_Break_Values.class, this, shortName, otherNames); } @Override @@ -853,7 +871,8 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Grapheme_Cluster_Break_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Grapheme_Cluster_Break_Values.class); public static Grapheme_Cluster_Break_Values forName(String name) { return NAME_MATCHER.get(name); @@ -870,8 +889,9 @@ public enum Hangul_Syllable_Type_Values implements Named { private final PropertyNames names; private Hangul_Syllable_Type_Values(String shortName, String... otherNames) { - names = new PropertyNames( - Hangul_Syllable_Type_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + Hangul_Syllable_Type_Values.class, this, shortName, otherNames); } @Override @@ -884,7 +904,8 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Hangul_Syllable_Type_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Hangul_Syllable_Type_Values.class); public static Hangul_Syllable_Type_Values forName(String name) { return NAME_MATCHER.get(name); @@ -897,8 +918,9 @@ public enum Identifier_Status_Values implements Named { private final PropertyNames names; private Identifier_Status_Values(String shortName, String... 
otherNames) { - names = new PropertyNames( - Identifier_Status_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + Identifier_Status_Values.class, this, shortName, otherNames); } @Override @@ -911,7 +933,8 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Identifier_Status_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Identifier_Status_Values.class); public static Identifier_Status_Values forName(String name) { return NAME_MATCHER.get(name); @@ -935,8 +958,9 @@ public enum Identifier_Type_Values implements Named { private final PropertyNames names; private Identifier_Type_Values(String shortName, String... otherNames) { - names = new PropertyNames( - Identifier_Type_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + Identifier_Type_Values.class, this, shortName, otherNames); } @Override @@ -949,7 +973,8 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Identifier_Type_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Identifier_Type_Values.class); public static Identifier_Type_Values forName(String name) { return NAME_MATCHER.get(name); @@ -963,8 +988,9 @@ public enum Idn_2008_Values implements Named { private final PropertyNames names; private Idn_2008_Values(String shortName, String... 
otherNames) { - names = new PropertyNames( - Idn_2008_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + Idn_2008_Values.class, this, shortName, otherNames); } @Override @@ -977,14 +1003,15 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Idn_2008_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Idn_2008_Values.class); public static Idn_2008_Values forName(String name) { return NAME_MATCHER.get(name); } } - // Idn_Mapping + // Idn_Mapping public enum Idn_Status_Values implements Named { valid("v"), ignored("i"), @@ -996,8 +1023,9 @@ public enum Idn_Status_Values implements Named { private final PropertyNames names; private Idn_Status_Values(String shortName, String... otherNames) { - names = new PropertyNames( - Idn_Status_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + Idn_Status_Values.class, this, shortName, otherNames); } @Override @@ -1010,7 +1038,8 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Idn_Status_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Idn_Status_Values.class); public static Idn_Status_Values forName(String name) { return NAME_MATCHER.get(name); @@ -1025,8 +1054,9 @@ public enum Indic_Conjunct_Break_Values implements Named { private final PropertyNames names; private Indic_Conjunct_Break_Values(String shortName, String... 
otherNames) { - names = new PropertyNames( - Indic_Conjunct_Break_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + Indic_Conjunct_Break_Values.class, this, shortName, otherNames); } @Override @@ -1039,7 +1069,8 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Indic_Conjunct_Break_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Indic_Conjunct_Break_Values.class); public static Indic_Conjunct_Break_Values forName(String name) { return NAME_MATCHER.get(name); @@ -1066,8 +1097,9 @@ public enum Indic_Positional_Category_Values implements Named { private final PropertyNames names; private Indic_Positional_Category_Values(String shortName, String... otherNames) { - names = new PropertyNames( - Indic_Positional_Category_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + Indic_Positional_Category_Values.class, this, shortName, otherNames); } @Override @@ -1080,7 +1112,8 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Indic_Positional_Category_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Indic_Positional_Category_Values.class); public static Indic_Positional_Category_Values forName(String name) { return NAME_MATCHER.get(name); @@ -1128,8 +1161,9 @@ public enum Indic_Syllabic_Category_Values implements Named { private final PropertyNames names; private Indic_Syllabic_Category_Values(String shortName, String... 
otherNames) { - names = new PropertyNames( - Indic_Syllabic_Category_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + Indic_Syllabic_Category_Values.class, this, shortName, otherNames); } @Override @@ -1142,14 +1176,15 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Indic_Syllabic_Category_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Indic_Syllabic_Category_Values.class); public static Indic_Syllabic_Category_Values forName(String name) { return NAME_MATCHER.get(name); } } - // ISO_Comment + // ISO_Comment public enum Jamo_Short_Name_Values implements Named { A("A"), AE("AE"), @@ -1206,8 +1241,9 @@ public enum Jamo_Short_Name_Values implements Named { private final PropertyNames names; private Jamo_Short_Name_Values(String shortName, String... otherNames) { - names = new PropertyNames( - Jamo_Short_Name_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + Jamo_Short_Name_Values.class, this, shortName, otherNames); } @Override @@ -1220,7 +1256,8 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Jamo_Short_Name_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Jamo_Short_Name_Values.class); public static Jamo_Short_Name_Values forName(String name) { return NAME_MATCHER.get(name); @@ -1337,8 +1374,9 @@ public enum Joining_Group_Values implements Named { private final PropertyNames names; private Joining_Group_Values(String shortName, String... 
otherNames) { - names = new PropertyNames( - Joining_Group_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + Joining_Group_Values.class, this, shortName, otherNames); } @Override @@ -1351,7 +1389,8 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Joining_Group_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Joining_Group_Values.class); public static Joining_Group_Values forName(String name) { return NAME_MATCHER.get(name); @@ -1368,8 +1407,9 @@ public enum Joining_Type_Values implements Named { private final PropertyNames names; private Joining_Type_Values(String shortName, String... otherNames) { - names = new PropertyNames( - Joining_Type_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + Joining_Type_Values.class, this, shortName, otherNames); } @Override @@ -1382,34 +1422,35 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Joining_Type_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Joining_Type_Values.class); public static Joining_Type_Values forName(String name) { return NAME_MATCHER.get(name); } } - // kAccountingNumeric - // kAlternateHanYu - // kAlternateJEF - // kAlternateKangXi - // kAlternateMorohashi - // kAlternateTotalStrokes - // kBigFive - // kCangjie - // kCantonese - // kCCCII - // kCheungBauer - // kCheungBauerIndex - // kCihaiT - // kCNS1986 - // kCNS1992 - // kCompatibilityVariant - // kCowles - // kDaeJaweon - // kDefinition - // kEACC - // kEH_Cat + // kAccountingNumeric + // kAlternateHanYu + // kAlternateJEF + // kAlternateKangXi + // kAlternateMorohashi + // kAlternateTotalStrokes + // kBigFive + // kCangjie + // kCantonese + // kCCCII + // kCheungBauer + // kCheungBauerIndex + // kCihaiT + // kCNS1986 + 
// kCNS1992 + // kCompatibilityVariant + // kCowles + // kDaeJaweon + // kDefinition + // kEACC + // kEH_Cat public enum kEH_Core_Values implements Named { Core("C"), Legacy("L"), @@ -1417,8 +1458,9 @@ public enum kEH_Core_Values implements Named { private final PropertyNames names; private kEH_Core_Values(String shortName, String... otherNames) { - names = new PropertyNames( - kEH_Core_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + kEH_Core_Values.class, this, shortName, otherNames); } @Override @@ -1431,121 +1473,122 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(kEH_Core_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(kEH_Core_Values.class); public static kEH_Core_Values forName(String name) { return NAME_MATCHER.get(name); } } - // kEH_Desc - // kEH_Func - // kEH_FVal - // kEH_HG - // kEH_IFAO - // kEH_JSesh - // kEH_UniK - // kFanqie - // kFenn - // kFennIndex - // kFourCornerCode - // kFrequency - // kGB0 - // kGB1 - // kGB3 - // kGB5 - // kGB7 - // kGB8 - // kGradeLevel - // kGSR - // kHangul - // kHanYu - // kHanyuPinlu - // kHanyuPinyin - // kHDZRadBreak - // kHKGlyph - // kHKSCS - // kIBMJapan - // kIICore - // kIRG_GSource - // kIRG_HSource - // kIRG_JSource - // kIRG_KPSource - // kIRG_KSource - // kIRG_MSource - // kIRG_SSource - // kIRG_TSource - // kIRG_UKSource - // kIRG_USource - // kIRG_VSource - // kIRGDaeJaweon - // kIRGDaiKanwaZiten - // kIRGHanyuDaZidian - // kIRGKangXi - // kJa - // kJapanese - // kJapaneseKun - // kJapaneseOn - // kJHJ - // kJinmeiyoKanji - // kJis0 - // kJis1 - // kJIS0213 - // kJoyoKanji - // kKangXi - // kKarlgren - // kKorean - // kKoreanEducationHanja - // kKoreanName - // kKPS0 - // kKPS1 - // kKSC0 - // kKSC1 - // kLau - // kMainlandTelegraph - // kMandarin - // kMatthews - // kMeyerWempe - // kMojiJoho - // kMorohashi - // kNelson - // 
kOtherNumeric - // kPhonetic - // kPrimaryNumeric - // kPseudoGB1 - // kReading - // kRSAdobe_Japan1_6 - // kRSJapanese - // kRSKangXi - // kRSKanWa - // kRSKorean - // kRSMerged - // kRSTUnicode - // kRSUnicode - // kSBGY - // kSemanticVariant - // kSimplifiedVariant - // kSMSZD2003Index - // kSMSZD2003Readings - // kSpecializedSemanticVariant - // kSpoofingVariant - // kSrc_NushuDuben - // kStrange - // kTaiwanTelegraph - // kTang - // kTGH - // kTGHZ2013 - // kTGT_MergedSrc - // kTotalStrokes - // kTraditionalVariant - // kUnihanCore2020 - // kVietnamese - // kVietnameseNumeric - // kXerox - // kXHC1983 - // kZhuang - // kZhuangNumeric - // kZVariant + // kEH_Desc + // kEH_Func + // kEH_FVal + // kEH_HG + // kEH_IFAO + // kEH_JSesh + // kEH_UniK + // kFanqie + // kFenn + // kFennIndex + // kFourCornerCode + // kFrequency + // kGB0 + // kGB1 + // kGB3 + // kGB5 + // kGB7 + // kGB8 + // kGradeLevel + // kGSR + // kHangul + // kHanYu + // kHanyuPinlu + // kHanyuPinyin + // kHDZRadBreak + // kHKGlyph + // kHKSCS + // kIBMJapan + // kIICore + // kIRG_GSource + // kIRG_HSource + // kIRG_JSource + // kIRG_KPSource + // kIRG_KSource + // kIRG_MSource + // kIRG_SSource + // kIRG_TSource + // kIRG_UKSource + // kIRG_USource + // kIRG_VSource + // kIRGDaeJaweon + // kIRGDaiKanwaZiten + // kIRGHanyuDaZidian + // kIRGKangXi + // kJa + // kJapanese + // kJapaneseKun + // kJapaneseOn + // kJHJ + // kJinmeiyoKanji + // kJis0 + // kJis1 + // kJIS0213 + // kJoyoKanji + // kKangXi + // kKarlgren + // kKorean + // kKoreanEducationHanja + // kKoreanName + // kKPS0 + // kKPS1 + // kKSC0 + // kKSC1 + // kLau + // kMainlandTelegraph + // kMandarin + // kMatthews + // kMeyerWempe + // kMojiJoho + // kMorohashi + // kNelson + // kOtherNumeric + // kPhonetic + // kPrimaryNumeric + // kPseudoGB1 + // kReading + // kRSAdobe_Japan1_6 + // kRSJapanese + // kRSKangXi + // kRSKanWa + // kRSKorean + // kRSMerged + // kRSTUnicode + // kRSUnicode + // kSBGY + // kSemanticVariant + // 
kSimplifiedVariant + // kSMSZD2003Index + // kSMSZD2003Readings + // kSpecializedSemanticVariant + // kSpoofingVariant + // kSrc_NushuDuben + // kStrange + // kTaiwanTelegraph + // kTang + // kTGH + // kTGHZ2013 + // kTGT_MergedSrc + // kTotalStrokes + // kTraditionalVariant + // kUnihanCore2020 + // kVietnamese + // kVietnameseNumeric + // kXerox + // kXHC1983 + // kZhuang + // kZhuangNumeric + // kZVariant public enum Line_Break_Values implements Named { Ambiguous("AI"), Aksara("AK"), @@ -1598,8 +1641,9 @@ public enum Line_Break_Values implements Named { private final PropertyNames names; private Line_Break_Values(String shortName, String... otherNames) { - names = new PropertyNames( - Line_Break_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + Line_Break_Values.class, this, shortName, otherNames); } @Override @@ -1612,21 +1656,22 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Line_Break_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Line_Break_Values.class); public static Line_Break_Values forName(String name) { return NAME_MATCHER.get(name); } } - // Lowercase_Mapping - // Name - // Name_Alias - // Named_Sequences - // Named_Sequences_Prov - // NC_Corrected - // NC_Original - // NC_Version + // Lowercase_Mapping + // Name + // Name_Alias + // Named_Sequences + // Named_Sequences_Prov + // NC_Corrected + // NC_Original + // NC_Version public enum NFC_Quick_Check_Values implements Named { Maybe("M"), No("N"), @@ -1634,8 +1679,9 @@ public enum NFC_Quick_Check_Values implements Named { private final PropertyNames names; private NFC_Quick_Check_Values(String shortName, String... 
otherNames) { - names = new PropertyNames( - NFC_Quick_Check_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + NFC_Quick_Check_Values.class, this, shortName, otherNames); } @Override @@ -1648,7 +1694,8 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(NFC_Quick_Check_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(NFC_Quick_Check_Values.class); public static NFC_Quick_Check_Values forName(String name) { return NAME_MATCHER.get(name); @@ -1661,8 +1708,9 @@ public enum NFD_Quick_Check_Values implements Named { private final PropertyNames names; private NFD_Quick_Check_Values(String shortName, String... otherNames) { - names = new PropertyNames( - NFD_Quick_Check_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + NFD_Quick_Check_Values.class, this, shortName, otherNames); } @Override @@ -1675,14 +1723,15 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(NFD_Quick_Check_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(NFD_Quick_Check_Values.class); public static NFD_Quick_Check_Values forName(String name) { return NAME_MATCHER.get(name); } } - // NFKC_Casefold + // NFKC_Casefold public enum NFKC_Quick_Check_Values implements Named { Maybe("M"), No("N"), @@ -1690,8 +1739,9 @@ public enum NFKC_Quick_Check_Values implements Named { private final PropertyNames names; private NFKC_Quick_Check_Values(String shortName, String... 
otherNames) { - names = new PropertyNames( - NFKC_Quick_Check_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + NFKC_Quick_Check_Values.class, this, shortName, otherNames); } @Override @@ -1704,22 +1754,24 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(NFKC_Quick_Check_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(NFKC_Quick_Check_Values.class); public static NFKC_Quick_Check_Values forName(String name) { return NAME_MATCHER.get(name); } } - // NFKC_Simple_Casefold + // NFKC_Simple_Casefold public enum NFKD_Quick_Check_Values implements Named { No("N"), Yes("Y"); private final PropertyNames names; private NFKD_Quick_Check_Values(String shortName, String... otherNames) { - names = new PropertyNames( - NFKD_Quick_Check_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + NFKD_Quick_Check_Values.class, this, shortName, otherNames); } @Override @@ -1732,7 +1784,8 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(NFKD_Quick_Check_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(NFKD_Quick_Check_Values.class); public static NFKD_Quick_Check_Values forName(String name) { return NAME_MATCHER.get(name); @@ -1747,8 +1800,9 @@ public enum Numeric_Type_Values implements Named { private final PropertyNames names; private Numeric_Type_Values(String shortName, String... 
otherNames) { - names = new PropertyNames( - Numeric_Type_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + Numeric_Type_Values.class, this, shortName, otherNames); } @Override @@ -1761,14 +1815,15 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Numeric_Type_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Numeric_Type_Values.class); public static Numeric_Type_Values forName(String name) { return NAME_MATCHER.get(name); } } - // Numeric_Value + // Numeric_Value public enum Other_Joining_Type_Values implements Named { Join_Causing("C"), Dual_Joining("D"), @@ -1780,8 +1835,9 @@ public enum Other_Joining_Type_Values implements Named { private final PropertyNames names; private Other_Joining_Type_Values(String shortName, String... otherNames) { - names = new PropertyNames( - Other_Joining_Type_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + Other_Joining_Type_Values.class, this, shortName, otherNames); } @Override @@ -1794,7 +1850,8 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Other_Joining_Type_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Other_Joining_Type_Values.class); public static Other_Joining_Type_Values forName(String name) { return NAME_MATCHER.get(name); @@ -1989,8 +2046,9 @@ public enum Script_Values implements Named { private final PropertyNames names; private Script_Values(String shortName, String... 
otherNames) { - names = new PropertyNames( - Script_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + Script_Values.class, this, shortName, otherNames); } @Override @@ -2003,14 +2061,15 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Script_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Script_Values.class); public static Script_Values forName(String name) { return NAME_MATCHER.get(name); } } - // Script_Extensions + // Script_Extensions public enum Sentence_Break_Values implements Named { ATerm("AT"), Close("CL"), @@ -2030,8 +2089,9 @@ public enum Sentence_Break_Values implements Named { private final PropertyNames names; private Sentence_Break_Values(String shortName, String... otherNames) { - names = new PropertyNames( - Sentence_Break_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + Sentence_Break_Values.class, this, shortName, otherNames); } @Override @@ -2044,21 +2104,22 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Sentence_Break_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Sentence_Break_Values.class); public static Sentence_Break_Values forName(String name) { return NAME_MATCHER.get(name); } } - // Simple_Case_Folding - // Simple_Lowercase_Mapping - // Simple_Titlecase_Mapping - // Simple_Uppercase_Mapping - // Standardized_Variant - // Titlecase_Mapping - // Unicode_1_Name - // Uppercase_Mapping + // Simple_Case_Folding + // Simple_Lowercase_Mapping + // Simple_Titlecase_Mapping + // Simple_Uppercase_Mapping + // Standardized_Variant + // Titlecase_Mapping + // Unicode_1_Name + // Uppercase_Mapping public enum Vertical_Orientation_Values implements Named { Rotated("R"), Transformed_Rotated("Tr"), @@ -2067,8 
+2128,9 @@ public enum Vertical_Orientation_Values implements Named { private final PropertyNames names; private Vertical_Orientation_Values(String shortName, String... otherNames) { - names = new PropertyNames( - Vertical_Orientation_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + Vertical_Orientation_Values.class, this, shortName, otherNames); } @Override @@ -2081,7 +2143,8 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Vertical_Orientation_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Vertical_Orientation_Values.class); public static Vertical_Orientation_Values forName(String name) { return NAME_MATCHER.get(name); @@ -2115,8 +2178,9 @@ public enum Word_Break_Values implements Named { private final PropertyNames names; private Word_Break_Values(String shortName, String... otherNames) { - names = new PropertyNames( - Word_Break_Values.class, this, shortName, otherNames); + names = + new PropertyNames( + Word_Break_Values.class, this, shortName, otherNames); } @Override @@ -2129,11 +2193,11 @@ public String getShortName() { return names.getShortName(); } - private static final NameMatcher NAME_MATCHER = PropertyNames.getNameToEnums(Word_Break_Values.class); + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Word_Break_Values.class); public static Word_Break_Values forName(String name) { return NAME_MATCHER.get(name); } } - } From 7aeb98163c9a37bd31a7d9f1afdc634344cee662 Mon Sep 17 00:00:00 2001 From: John Wilcock Date: Tue, 18 Feb 2025 16:07:52 -0800 Subject: [PATCH 09/10] Use default values where possible --- docs/ucdxml.md | 61 ++++++++++++++++--- .../unicode/xml/GeneratePropertyValues.java | 27 +++++--- .../src/main/java/org/unicode/xml/UCDXML.java | 36 ++++++++--- 3 files changed, 100 insertions(+), 24 deletions(-) diff --git a/docs/ucdxml.md b/docs/ucdxml.md 
index a8d1d1e954..6711254f4c 100644 --- a/docs/ucdxml.md +++ b/docs/ucdxml.md @@ -1,14 +1,62 @@ -# Generating TR42 +# UCDXML -## Step 1 - Generate property value fragments +There are three separate processes for generating and validating UCDXML files and their corresponding UAX42 report. -- mvn compile exec:java '-Dexec.mainClass="org.unicode.xml.GeneratePropertyValues"' '-Dexec.args="--ucdversion 16.0.0 -f $(cd ./unicodetools/src/main/resources/org/unicode/uax42/fragments; pwd)"' -DCLDR_DIR=$(cd ../cldr ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated ; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) +1. Generate the UCDXML files. +2. (Optional) You can compare the generated UCDXML files against each other (e.g., Flat vs Grouped) or against + previous versions. +3. Generate UAX42. There are three steps involved: -## Step 2 - Generate TR42 index.html and index.rnc + 1. Generate the property value fragments. The updated versions should live in + unicodetools/src/main/resources/org/unicode/uax42/fragments + 2. Generate the index.html and index.rnc files for UAX42. + 3. (Optional) Validate the UCDXML files using index.rnc. -- mvn xml:transform -f $(cd ./unicodetools/src/main/resources/org/unicode/uax42/fragments; pwd) -Doutputdir=../Generated/uax42/ +## Generate UCDXML files -## Step 3 - Validate generated UAX XML files +- You can generate flat or grouped versions of UCDXML. 
+- You can generate UCDXML files for: + - the full range of code points + - the Unihan code points + - code points that are not Unihan code points + +``` +mvn compile exec:java '-Dexec.mainClass="org.unicode.xml.UCDXML"' '-Dexec.args="--range ALL --output FLAT"' -DCLDR_DIR=$(cd ../cldr; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) +mvn compile exec:java '-Dexec.mainClass="org.unicode.xml.UCDXML"' '-Dexec.args="--range UNIHAN --output FLAT"' -DCLDR_DIR=$(cd ../cldr; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) +mvn compile exec:java '-Dexec.mainClass="org.unicode.xml.UCDXML"' '-Dexec.args="--range NOUNIHAN --output FLAT"' -DCLDR_DIR=$(cd ../cldr; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) +mvn compile exec:java '-Dexec.mainClass="org.unicode.xml.UCDXML"' '-Dexec.args="--range ALL --output GROUPED"' -DCLDR_DIR=$(cd ../cldr; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) +mvn compile exec:java '-Dexec.mainClass="org.unicode.xml.UCDXML"' '-Dexec.args="--range UNIHAN --output GROUPED"' -DCLDR_DIR=$(cd ../cldr; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) +mvn compile exec:java '-Dexec.mainClass="org.unicode.xml.UCDXML"' '-Dexec.args="--range NOUNIHAN --output GROUPED"' -DCLDR_DIR=$(cd ../cldr; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) +``` + +## Compare UCDXML files + +After generating UCDXML files, you can compare: + +- Different versions of the same type (range and output) of UCDXML file +- Grouped and flat versions of the same code point range + +``` +mvn compile exec:java '-Dexec.mainClass="org.unicode.xml.CompareUCDXML"' '-Dexec.args="-a {path to file} -b {path to file}"' +``` + +## Generating TR42 + +### Step 1 - Generate property value fragments + +``` +mvn compile exec:java '-Dexec.mainClass="org.unicode.xml.GeneratePropertyValues"' 
-DCLDR_DIR=$(cd ../cldr ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated ; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) +``` + +UAX42 fragments live in unicodetools/src/main/resources/org/unicode/uax42/fragments + +### Step 2 - Generate TR42 index.html and index.rnc + +``` +mvn xml:transform -f $(cd ./unicodetools/src/main/resources/org/unicode/uax42; pwd) -Doutputdir=$(cd ../Generated/uax42; pwd) +``` + +### Step 3 - Validate generated UAX XML files You'll need a [RELAX NG](https://relaxng.org/) schema validator. We'll use [jing-trang](https://github.com/relaxng/jing-trang) in this example. @@ -19,4 +67,3 @@ We'll use [jing-trang](https://github.com/relaxng/jing-trang) in this example. java -jar C:\_git\jing-trang\build\jing.jar -c UNICODETOOLS_REPO_DIR\uax\uax42\output\index.rnc ``` Note that the UAX xml file has to be saved as NFD as the Unihan syntax regular expressions are expecting NFD. - diff --git a/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java b/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java index 8b51c7350b..0d28734b08 100644 --- a/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java +++ b/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java @@ -21,6 +21,7 @@ import org.unicode.props.PropertyParsingInfo; import org.unicode.props.UcdProperty; import org.unicode.props.UcdPropertyValues.*; +import org.unicode.text.utility.Settings; /** * Utility for generating fragments that describe the property values in a format that can be @@ -109,7 +110,7 @@ public String getSyntax() { private static final String TR38URL = "https://www.unicode.org/reports/tr38"; private static final UOption[] options = { UOption.HELP_H(), - UOption.create("ucdversion", 'v', UOption.REQUIRES_ARG), + UOption.create("ucdversion", 'v', UOption.OPTIONAL_ARG), UOption.create("outputfolder", 'f', UOption.REQUIRES_ARG) }; @@ -123,7 +124,7 @@ public static void main(String[] args) throws Exception { if 
(options[HELP].doesOccur) { System.out.println( - "GeneratePropertyValuesList --ucdversion {version number} --outputfolder {destination}"); + "GeneratePropertyValuesList [--ucdversion {version number}] [--outputfolder {destination}]"); System.exit(0); } @@ -138,14 +139,13 @@ public static void main(String[] args) throws Exception { + " to a valid UCD version"); } } else { - throw new IllegalArgumentException( - "Missing command line option: --ucdversion (or -v)"); + ucdVersion = VersionInfo.getInstance(Settings.latestVersion); } if (options[OUTPUTFOLDER].doesOccur) { try { destinationFolder = new File(options[OUTPUTFOLDER].value); if (!destinationFolder.exists()) { - if (!destinationFolder.mkdir()) { + if (!destinationFolder.mkdirs()) { throw new IOException(); } } @@ -154,8 +154,19 @@ public static void main(String[] args) throws Exception { "Could not find or create " + options[OUTPUTFOLDER].value); } } else { - throw new IllegalArgumentException( - "Missing command line option: --outputfolder (or -f)"); + try { + destinationFolder = new File(Settings.Output.GEN_DIR + "uax42\\fragments\\"); + if (!destinationFolder.exists()) { + if (!destinationFolder.mkdirs()) { + throw new IOException(); + } + } + } catch (Exception e) { + throw new IllegalArgumentException( + "Could not find or create " + + Settings.Output.GEN_DIR + + "uax42\\fragments\\"); + } } } catch (Exception e) { @@ -168,7 +179,7 @@ public static void main(String[] args) throws Exception { System.out.println("End"); System.exit(0); } else { - System.err.println("Unexpected error when building UcdXML file."); + System.err.println("Unexpected error when generating uax42 fragment files."); System.exit(1); } } diff --git a/unicodetools/src/main/java/org/unicode/xml/UCDXML.java b/unicodetools/src/main/java/org/unicode/xml/UCDXML.java index 8cbe59d78d..d4c302e1d7 100644 --- a/unicodetools/src/main/java/org/unicode/xml/UCDXML.java +++ b/unicodetools/src/main/java/org/unicode/xml/UCDXML.java @@ -23,6 +23,7 @@ 
import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.UcdProperty; import org.unicode.props.UcdPropertyValues; +import org.unicode.text.utility.Settings; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; @@ -66,10 +67,10 @@ public String toString() { private static final UOption[] options = { UOption.HELP_H(), - UOption.create("ucdversion", 'v', UOption.REQUIRES_ARG), + UOption.create("ucdversion", 'v', UOption.OPTIONAL_ARG), UOption.create("range", 'r', UOption.REQUIRES_ARG), UOption.create("output", 'o', UOption.REQUIRES_ARG), - UOption.create("outputfolder", 'f', UOption.REQUIRES_ARG) + UOption.create("outputfolder", 'f', UOption.OPTIONAL_ARG) }; private static final int HELP = 0, UCDVERSION = 1, RANGE = 2, OUTPUT = 3, OUTPUTFOLDER = 4; @@ -88,7 +89,7 @@ public static void main(String[] args) throws Exception { if (options[HELP].doesOccur) { System.out.println( - "UCDXML --ucdversion {version number} --outputfolder {destination} " + "UCDXML [--ucdversion {version number}] [--outputfolder {destination}] " + "--range [ALL|NOUNIHAN|UNIHAN] --output [FLAT|GROUPED]"); System.exit(0); } @@ -104,8 +105,7 @@ public static void main(String[] args) throws Exception { + " to a valid UCD version"); } } else { - throw new IllegalArgumentException( - "Missing command line option: --ucdversion (or -v)"); + ucdVersion = VersionInfo.getInstance(Settings.latestVersion); } if (options[RANGE].doesOccur) { try { @@ -141,9 +141,9 @@ public static void main(String[] args) throws Exception { new File( options[OUTPUTFOLDER].value + ucdVersion.getVersionString(3, 3) - + "\\xmltest\\"); + + "/"); if (!destinationFolder.exists()) { - if (!destinationFolder.mkdir()) { + if (!destinationFolder.mkdirs()) { throw new IOException(); } } @@ -152,8 +152,26 @@ public static void main(String[] args) throws Exception { "Could not find or create " + options[OUTPUTFOLDER].value); } } else { - throw new IllegalArgumentException( - "Missing command line 
option: --outputfolder (or -f)"); + try { + destinationFolder = + new File( + Settings.Output.GEN_DIR + + "ucdxml\\" + + ucdVersion.getVersionString(3, 3) + + "\\"); + if (!destinationFolder.exists()) { + if (!destinationFolder.mkdirs()) { + throw new IOException(); + } + } + } catch (Exception e) { + throw new IllegalArgumentException( + "Could not find or create " + + Settings.Output.GEN_DIR + + "ucdxml\\" + + ucdVersion.getVersionString(3, 3) + + "\\"); + } } } catch (Exception e) { From 9b4e913703fbd4052aefd7358c334579713e3db1 Mon Sep 17 00:00:00 2001 From: John Wilcock Date: Fri, 21 Mar 2025 09:55:27 -0700 Subject: [PATCH 10/10] Corrections from Markus's review --- .../java/org/unicode/props/UcdProperty.java | 11 +++-- .../org/unicode/props/UcdPropertyValues.java | 6 +-- .../unicode/xml/GeneratePropertyValues.java | 36 +++++---------- .../org/unicode/xml/UCDSectionDetail.java | 4 +- .../src/main/java/org/unicode/xml/UCDXML.java | 44 ++++++------------- .../java/org/unicode/xml/XMLProperties.java | 2 +- .../unicode/props/ExtraPropertyAliases.txt | 8 ++-- .../org/unicode/props/IndexPropertyRegex.txt | 6 +-- .../unicode/props/IndexUnicodeProperties.txt | 6 +-- .../org/unicode/uax42/fragments/blk.xml | 9 ++++ .../org/unicode/uax42/fragments/joining.xml | 2 +- .../org/unicode/uax42/fragments/script.xml | 17 +++---- 12 files changed, 67 insertions(+), 84 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/props/UcdProperty.java b/unicodetools/src/main/java/org/unicode/props/UcdProperty.java index ec3d513a3a..26a39ed217 100644 --- a/unicodetools/src/main/java/org/unicode/props/UcdProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UcdProperty.java @@ -85,16 +85,13 @@ public enum UcdProperty { Emoji_SB(PropertyType.Miscellaneous, "ESB"), ISO_Comment(PropertyType.Miscellaneous, "isc"), Jamo_Short_Name(PropertyType.Miscellaneous, "JSN"), - NC_Corrected(PropertyType.Miscellaneous, "ncCorrected"), - NC_Original(PropertyType.Miscellaneous, 
"ncOriginal"), - NC_Version(PropertyType.Miscellaneous, "ncVersion"), Name(PropertyType.Miscellaneous, "na"), Name_Alias(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "Name_Alias"), Named_Sequences(PropertyType.Miscellaneous, "NS"), Named_Sequences_Prov(PropertyType.Miscellaneous, "NSP"), Standardized_Variant(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "SV"), Unicode_1_Name(PropertyType.Miscellaneous, "na1"), - emoji_variation_sequence(PropertyType.Miscellaneous, "EVS"), + emoji_variation_sequence(PropertyType.Miscellaneous, "emoji_variation_sequence"), kAlternateHanYu(PropertyType.Miscellaneous, "cjkAlternateHanYu"), kAlternateJEF(PropertyType.Miscellaneous, "cjkAlternateJEF"), kAlternateKangXi(PropertyType.Miscellaneous, "cjkAlternateKangXi"), @@ -246,6 +243,12 @@ public enum UcdProperty { kZhuang(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkZhuang"), kZhuangNumeric( PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkZhuangNumeric"), + normalization_correction_corrected( + PropertyType.Miscellaneous, "normalization_correction_corrected"), + normalization_correction_original( + PropertyType.Miscellaneous, "normalization_correction_original"), + normalization_correction_version( + PropertyType.Miscellaneous, "normalization_correction_version"), // Catalog Age(PropertyType.Catalog, Age_Values.class, null, "age"), diff --git a/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java b/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java index 0aac98c263..f47a2cfb84 100644 --- a/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java +++ b/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java @@ -1669,9 +1669,6 @@ public static Line_Break_Values forName(String name) { // Name_Alias // Named_Sequences // Named_Sequences_Prov - // NC_Corrected - // NC_Original - // NC_Version public enum NFC_Quick_Check_Values implements Named { Maybe("M"), No("N"), 
@@ -1792,6 +1789,9 @@ public static NFKD_Quick_Check_Values forName(String name) { } } + // normalization_correction_corrected + // normalization_correction_original + // normalization_correction_version public enum Numeric_Type_Values implements Named { Decimal("De"), Digit("Di"), diff --git a/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java b/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java index 0d28734b08..9a360c178f 100644 --- a/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java +++ b/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java @@ -111,7 +111,7 @@ public String getSyntax() { private static final UOption[] options = { UOption.HELP_H(), UOption.create("ucdversion", 'v', UOption.OPTIONAL_ARG), - UOption.create("outputfolder", 'f', UOption.REQUIRES_ARG) + UOption.create("outputfolder", 'f', UOption.OPTIONAL_ARG) }; private static final int HELP = 0, UCDVERSION = 1, OUTPUTFOLDER = 2; @@ -141,32 +141,18 @@ public static void main(String[] args) throws Exception { } else { ucdVersion = VersionInfo.getInstance(Settings.latestVersion); } - if (options[OUTPUTFOLDER].doesOccur) { - try { - destinationFolder = new File(options[OUTPUTFOLDER].value); - if (!destinationFolder.exists()) { - if (!destinationFolder.mkdirs()) { - throw new IOException(); - } + destinationFolder = + options[OUTPUTFOLDER].doesOccur + ? 
new File(options[OUTPUTFOLDER].value) + : new File(Settings.Output.GEN_DIR + "uax42/fragments/"); + try { + if (!destinationFolder.exists()) { + if (!destinationFolder.mkdirs()) { + throw new IOException(); } - } catch (Exception e) { - throw new IllegalArgumentException( - "Could not find or create " + options[OUTPUTFOLDER].value); - } - } else { - try { - destinationFolder = new File(Settings.Output.GEN_DIR + "uax42\\fragments\\"); - if (!destinationFolder.exists()) { - if (!destinationFolder.mkdirs()) { - throw new IOException(); - } - } - } catch (Exception e) { - throw new IllegalArgumentException( - "Could not find or create " - + Settings.Output.GEN_DIR - + "uax42\\fragments\\"); } + } catch (Exception e) { + throw new IllegalArgumentException("Could not find or create " + destinationFolder); } } catch (Exception e) { diff --git a/unicodetools/src/main/java/org/unicode/xml/UCDSectionDetail.java b/unicodetools/src/main/java/org/unicode/xml/UCDSectionDetail.java index 6db3cf82bb..1c87f14eb3 100644 --- a/unicodetools/src/main/java/org/unicode/xml/UCDSectionDetail.java +++ b/unicodetools/src/main/java/org/unicode/xml/UCDSectionDetail.java @@ -160,7 +160,9 @@ public boolean getParserWithMissing() { UcdSection.NORMALIZATIONCORRECTIONS, new UCDSectionComponent[] { new UCDSectionComponent( - VersionInfo.getInstance(1, 1, 0), null, UcdProperty.NC_Original) + VersionInfo.getInstance(1, 1, 0), + null, + UcdProperty.normalization_correction_original) }, 2); public static UCDSectionDetail StandardizedVariants_Detail = diff --git a/unicodetools/src/main/java/org/unicode/xml/UCDXML.java b/unicodetools/src/main/java/org/unicode/xml/UCDXML.java index d4c302e1d7..3ec7de10e7 100644 --- a/unicodetools/src/main/java/org/unicode/xml/UCDXML.java +++ b/unicodetools/src/main/java/org/unicode/xml/UCDXML.java @@ -135,43 +135,25 @@ public static void main(String[] args) throws Exception { + " to one of [FLAT|GROUPED]"); } } - if (options[OUTPUTFOLDER].doesOccur) { - try { - 
destinationFolder = - new File( + destinationFolder = + options[OUTPUTFOLDER].doesOccur + ? new File( options[OUTPUTFOLDER].value + ucdVersion.getVersionString(3, 3) - + "/"); - if (!destinationFolder.exists()) { - if (!destinationFolder.mkdirs()) { - throw new IOException(); - } - } - } catch (Exception e) { - throw new IllegalArgumentException( - "Could not find or create " + options[OUTPUTFOLDER].value); - } - } else { - try { - destinationFolder = - new File( + + "/") + : new File( Settings.Output.GEN_DIR - + "ucdxml\\" + + "ucdxml/" + ucdVersion.getVersionString(3, 3) - + "\\"); - if (!destinationFolder.exists()) { - if (!destinationFolder.mkdirs()) { - throw new IOException(); - } + + "/"); + try { + if (!destinationFolder.exists()) { + if (!destinationFolder.mkdirs()) { + throw new IOException(); } - } catch (Exception e) { - throw new IllegalArgumentException( - "Could not find or create " - + Settings.Output.GEN_DIR - + "ucdxml\\" - + ucdVersion.getVersionString(3, 3) - + "\\"); } + } catch (Exception e) { + throw new IllegalArgumentException("Could not find or create " + destinationFolder); } } catch (Exception e) { diff --git a/unicodetools/src/main/java/org/unicode/xml/XMLProperties.java b/unicodetools/src/main/java/org/unicode/xml/XMLProperties.java index d1f6e178e8..f103a56f49 100644 --- a/unicodetools/src/main/java/org/unicode/xml/XMLProperties.java +++ b/unicodetools/src/main/java/org/unicode/xml/XMLProperties.java @@ -280,7 +280,7 @@ public void startElement( + " version: " + attributes.get("version"); cps = Utility.fromHex(attributes.get("cp")); - appendProp(cps, UcdProperty.NC_Original, correction); + appendProp(cps, UcdProperty.normalization_correction_original, correction); break; case INSTEAD: final String instead = diff --git a/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt b/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt index 0f9cbda3dc..6f2bb09e20 100644 --- 
a/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt +++ b/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt @@ -67,7 +67,7 @@ CJKR ; CJK_Radical EDCM ; Emoji_DCM EKDDI ; Emoji_KDDI ESB ; Emoji_SB -EVS ; emoji_variation_sequence +emoji_variation_sequence ; emoji_variation_sequence NS ; Named_Sequences NSP ; Named_Sequences_Prov SV ; Standardized_Variant @@ -161,9 +161,9 @@ cjkJoyoKanji ; kJoyoKanji cjkKoreanEducationHanja ; kKoreanEducationHanja cjkKoreanName ; kKoreanName cjkTGH ; kTGH -ncCorrected ; NC_Corrected -ncOriginal ; NC_Original -ncVersion ; NC_Version +normalization_correction_original ; normalization_correction_original +normalization_correction_corrected ; normalization_correction_corrected +normalization_correction_version ; normalization_correction_version # 13.0 cjkSpoofingVariant ; kSpoofingVariant cjkTGHZ2013 ; kTGHZ2013 diff --git a/unicodetools/src/main/resources/org/unicode/props/IndexPropertyRegex.txt b/unicodetools/src/main/resources/org/unicode/props/IndexPropertyRegex.txt index e280c7ff2b..07b816b352 100644 --- a/unicodetools/src/main/resources/org/unicode/props/IndexPropertyRegex.txt +++ b/unicodetools/src/main/resources/org/unicode/props/IndexPropertyRegex.txt @@ -206,9 +206,9 @@ kReading ; SINGLE_VALUED ; [a-z]{1,6}[1-6]+ kRSTUnicode ; SINGLE_VALUED ; [0-9]+\.[0-9]+ kTGT_MergedSrc ; SINGLE_VALUED ; L2008-[0-9A-F]{4,5}(-[0-9]{4,5})? 
-NC_Original ; SINGLE_VALUED ; [0-9A-F]{4,5} -NC_Corrected ; SINGLE_VALUED ; [0-9A-F]{4,5} -NC_Version ; SINGLE_VALUED ; [0-9]\.[0-9]\.[0-9] +normalization_correction_original ; SINGLE_VALUED ; [0-9A-F]{4,5} +normalization_correction_corrected ; SINGLE_VALUED ; [0-9A-F]{4,5} +normalization_correction_version ; SINGLE_VALUED ; [0-9]\.[0-9]\.[0-9] # ============================= # Catalog/Enum/Binary Properties diff --git a/unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt b/unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt index 5ff7cbdf58..c44f2caf5d 100644 --- a/unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt +++ b/unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt @@ -328,9 +328,9 @@ NushuSources ; kReading TangutSources ; kRSTUnicode TangutSources ; kTGT_MergedSrc -NormalizationCorrections ; NC_Original -NormalizationCorrections ; NC_Corrected -NormalizationCorrections ; NC_Version +NormalizationCorrections ; normalization_correction_original +NormalizationCorrections ; normalization_correction_corrected +NormalizationCorrections ; normalization_correction_version # Properties removed from Unihan before 5.1. 
# Point to a nonexistent file so that we don’t try to read them from the most recent monolithic diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/blk.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/blk.xml index ecd721a634..1a1872cb87 100644 --- a/unicodetools/src/main/resources/org/unicode/uax42/fragments/blk.xml +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/blk.xml @@ -28,6 +28,7 @@ | "Bassa_Vah" | "Batak" | "Bengali" + | "Beria_Erfe" | "Bhaiksuki" | "Block_Elements" | "Bopomofo" @@ -45,6 +46,7 @@ | "Cherokee" | "Cherokee_Sup" | "Chess_Symbols" + | "Chisoi" | "Chorasmian" | "CJK" | "CJK_Compat" @@ -60,6 +62,7 @@ | "CJK_Ext_G" | "CJK_Ext_H" | "CJK_Ext_I" + | "CJK_Ext_J" | "CJK_Radicals_Sup" | "CJK_Strokes" | "CJK_Symbols" @@ -209,6 +212,7 @@ | "Misc_Math_Symbols_B" | "Misc_Pictographs" | "Misc_Symbols" + | "Misc_Symbols_Sup" | "Misc_Technical" | "Modi" | "Modifier_Letters" @@ -268,9 +272,11 @@ | "Samaritan" | "Saurashtra" | "Sharada" + | "Sharada_Sup" | "Shavian" | "Shorthand_Format_Controls" | "Siddham" + | "Sidetic" | "Sinhala" | "Sinhala_Archaic_Numbers" | "Small_Forms" @@ -305,12 +311,14 @@ | "Tai_Tham" | "Tai_Viet" | "Tai_Xuan_Jing" + | "Tai_Yo" | "Takri" | "Tamil" | "Tamil_Sup" | "Tangsa" | "Tangut" | "Tangut_Components" + | "Tangut_Components_Sup" | "Tangut_Sup" | "Telugu" | "Thaana" @@ -319,6 +327,7 @@ | "Tifinagh" | "Tirhuta" | "Todhri" + | "Tolong_Siki" | "Toto" | "Transport_And_Map" | "Tulu_Tigalari" diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/joining.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/joining.xml index 184fcca14d..fb985f6a6d 100644 --- a/unicodetools/src/main/resources/org/unicode/uax42/fragments/joining.xml +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/joining.xml @@ -43,7 +43,7 @@ | "Sad" | "Sadhe" | "Seen" | "Semkath" | "Shin" | "Straight_Waw" | "Swash_Kaf" | "Syriac_Waw" | "Tah" | "Taw" | "Teh_Marbuta" | 
"Teh_Marbuta_Goal" - | "Teth" | "Thin_Yeh" + | "Teth" | "Thin_Noon" | "Thin_Yeh" | "Vertical_Tail" | "Waw" | "Yeh" | "Yeh_Barree" | "Yeh_With_Tail" | "Yudh" diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/script.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/script.xml index b22243aaf8..dfadfe3b02 100644 --- a/unicodetools/src/main/resources/org/unicode/uax42/fragments/script.xml +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/script.xml @@ -2,10 +2,10 @@ script = "Adlm" | "Aghb" | "Ahom" | "Arab" | "Armi" | "Armn" | "Avst" - | "Bali" | "Bamu" | "Bass" | "Batk" | "Beng" | "Bhks" - | "Bopo" | "Brah" | "Brai" | "Bugi" | "Buhd" - | "Cakm" | "Cans" | "Cari" | "Cham" | "Cher" | "Chrs" - | "Copt" | "Cpmn" | "Cprt" | "Cyrl" + | "Bali" | "Bamu" | "Bass" | "Batk" | "Beng" | "Berf" + | "Bhks" | "Bopo" | "Brah" | "Brai" | "Bugi" | "Buhd" + | "Cakm" | "Cans" | "Cari" | "Cham" | "Cher" | "Chis" + | "Chrs" | "Copt" | "Cpmn" | "Cprt" | "Cyrl" | "Deva" | "Diak" | "Dogr" | "Dsrt" | "Dupl" | "Egyp" | "Elba" | "Elym" | "Ethi" | "Gara" | "Geor" | "Glag" | "Gong" | "Gonm" | "Goth" @@ -29,11 +29,12 @@ | "Phnx" | "Plrd" | "Prti" | "Rjng" | "Rohg" | "Runr" | "Samr" | "Sarb" | "Saur" | "Sgnw" | "Shaw" | "Shrd" - | "Sidd" | "Sind" | "Sinh" | "Sogd" | "Sogo" | "Sora" - | "Soyo" | "Sund" | "Sunu" | "Sylo" | "Syrc" + | "Sidd" | "Sidt" | "Sind" | "Sinh" | "Sogd" | "Sogo" + | "Sora" | "Soyo" | "Sund" | "Sunu" | "Sylo" | "Syrc" | "Tagb" | "Takr" | "Tale" | "Talu" | "Taml" | "Tang" - | "Tavt" | "Telu" | "Tfng" | "Tglg" | "Thaa" | "Thai" - | "Tibt" | "Tirh" | "Tnsa" | "Todr" | "Toto" | "Tutg" + | "Tavt" | "Tayo" | "Telu" | "Tfng" | "Tglg" | "Thaa" + | "Thai" | "Tibt" | "Tirh" | "Tnsa" | "Todr" | "Tols" + | "Toto" | "Tutg" | "Ugar" | "Vaii" | "Vith" | "Wara" | "Wcho"