Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
import java.util.Map;
import java.util.regex.Pattern;
import org.unicode.cldr.util.MultiComparator;
import org.unicode.props.IndexUnicodeProperties;
import org.unicode.props.UcdProperty;
import org.unicode.props.UcdPropertyValues;
import org.unicode.props.UnicodeProperty;
import org.unicode.props.UnicodeProperty.PatternMatcher;
Expand Down Expand Up @@ -340,6 +342,15 @@ private boolean applyPropertyAlias0(
}
}
set = prop.getSet(propertyValue);
if (set.isEmpty()
&& prop instanceof IndexUnicodeProperties.IndexUnicodeProperty
&& prop.getName().equals("Name")) {
set =
((IndexUnicodeProperties.IndexUnicodeProperty) prop)
.getFactory()
.getProperty(UcdProperty.Name_Alias)
.getSet(propertyValue);
}
}
} else if (isAge) {
set = new UnicodeSet();
Expand Down
47 changes: 47 additions & 0 deletions UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,53 @@ public void TestInteriorlyNegatedComparison() {
"[[\\p{Uppercase}\\p{Changes_When_Lowercased}]-[\\p{Uppercase}&\\p{Changes_When_Lowercased}]]");
}

/**
 * Checks loose matching of character names in Name and Name_Alias property queries. Each
 * call pairs a query with the set it is expected to be equal to; an expected value of "[]"
 * means the query is expected to match nothing.
 */
@Test
public void TestNameMatching() {
// UAX44-LM2 for both Name and Name_Alias.
// Case and the hyphen/space distinction between words are expected not to matter here.
checkSetsEqual("\\p{Name=NO-BREAK SPACE}", "[\\xA0]");
checkSetsEqual("\\p{Name=no break space}", "[\\xA0]");
// O-E and OE are expected to stay distinct for HANGUL JUNGSEONG, whatever the case.
checkSetsEqual("\\p{Name=HANGUL JUNGSEONG O-E}", "[\\u1180]");
checkSetsEqual("\\p{Name=HANGUL JUNGSEONG OE}", "[\\u116C]");
checkSetsEqual("\\p{Name=Hangul jungseong o-e}", "[\\u1180]");
checkSetsEqual("\\p{Name=Hangul jungseong oe}", "[\\u116C]");
// Extra spaces or underscores around the name or next to the hyphen keep the same match.
checkSetsEqual("\\p{Name=HANGUL JUNGSEONG O -E}", "[\\u1180]");
checkSetsEqual("\\p{Name= HANGUL JUNGSEONG O-E }", "[\\u1180]");
checkSetsEqual("\\p{Name=_HANGUL_JUNGSEONG_O-E_}", "[\\u1180]");
// In O-EO the hyphen sits between letters and may be dropped or replaced by a space,
// but a hyphen preceded by a space ("O -EO") is expected to match nothing.
checkSetsEqual("\\p{Name=HANGUL JUNGSEONG O-EO}", "[\\u117F]");
checkSetsEqual("\\p{Name=HANGUL JUNGSEONG OE O}", "[\\u117F]");
checkSetsEqual("\\p{Name=HANGUL JUNGSEONG O -EO}", "[]");
// A hyphen at the start or end of a word is expected to be significant.
checkSetsEqual("\\p{Name=MARCHEN LETTER -A}", "[\\x{11C88}]");
checkSetsEqual("\\p{Name=MARCHEN_LETTER_-A}", "[\\x{11C88}]");
checkSetsEqual("\\p{Name=MARCHEN LETTER A}", "[\\x{11C8F}]");
checkSetsEqual("\\p{Name=TIBETAN MARK TSA -PHRU}", "[\\u0F39]");
checkSetsEqual("\\p{Name=TIBETAN MARK TSA PHRU}", "[]");
checkSetsEqual("\\p{Name=TIBETAN MARK BKA- SHOG YIG MGO}", "[\\u0F0A]");
checkSetsEqual("\\p{Name=TIBETAN MARK BKA SHOG YIG MGO}", "[]");
// The same loose matching is expected for Name_Alias values.
checkSetsEqual("\\p{Name_Alias=newline}", "[\\x0A]");
checkSetsEqual("\\p{Name_Alias=NEW LINE}", "[\\x0A]");
// The medial hyphen is only significant in HANGUL JUNGSEONG O-E, not in arbitrary O-E/OE.
checkSetsEqual("\\p{Name=twoemdash}", "⸺");
checkSetsEqual("\\p{Name=SeeNoEvil_Monkey}", "🙈");
checkSetsEqual("\\p{Name=BALLET S-H-O-E-S}", "🩰");
checkSetsEqual("[\\p{Name=LATIN SMALL LIGATURE O-E}uf]", "[œuf]");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🥚

}

/**
 * Checks how Name and Name_Alias queries relate for one character whose two spellings differ
 * (BRAKCET versus BRACKET): both spellings are expected to match through Name, while only the
 * BRACKET spelling is expected to match through Name_Alias.
 */
@Test
public void TestNameAliases() {
    // Name_Alias values behave as aliases for Name, but not vice-versa.
    final String brakcetSpelling =
            "PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET";
    final String bracketSpelling =
            "PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET";
    final String expectedCharacter = "[︘]";
    final String nothing = "[]";

    checkSetsEqual("\\p{Name=" + brakcetSpelling + "}", expectedCharacter);
    checkSetsEqual("\\p{Name=" + bracketSpelling + "}", expectedCharacter);
    checkSetsEqual("\\p{Name_Alias=" + brakcetSpelling + "}", nothing);
    checkSetsEqual("\\p{Name_Alias=" + bracketSpelling + "}", expectedCharacter);

    // NOTE(review): presumably asserts that [a-z] have no Name_Alias; confirm against
    // checkProperties.
    checkProperties("\\p{Name_Alias=@none@}", "[a-z]");
}

@Test
public void TestIdentityQuery() {
checkSetsEqual("\\p{NFKC_Casefold=@code point@}", "\\P{Changes_When_NFKC_Casefolded}");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,7 @@ public Map<UcdProperty, Long> getCacheFileSize() {
static final Transform<String, String> fromNumericPinyin =
Transliterator.getInstance("NumericPinyin-Latin;nfc");

// Shared Merge instance built from PropertyUtilities.Joiner with "|".
// NOTE(review): presumably combines several values for one code point into a single
// "|"-separated string; confirm against PropertyUtilities.Joiner.
static final Merge<String> MULTIVALUED_JOINER = new PropertyUtilities.Joiner("|");
static final Merge<String> ALPHABETIC_JOINER =
new Merge<String>() {
TreeSet<String> sorted = new TreeSet<String>();
Expand Down Expand Up @@ -684,7 +685,7 @@ public VersionInfo getUcdVersion() {
// .get(toSkeleton(propertyAlias));
// }

class IndexUnicodeProperty extends UnicodeProperty.BaseProperty {
public class IndexUnicodeProperty extends UnicodeProperty.BaseProperty {

private final UcdProperty prop;
private final Map<String, PropertyNames> stringToNamedEnum;
Expand Down Expand Up @@ -724,6 +725,10 @@ class IndexUnicodeProperty extends UnicodeProperty.BaseProperty {
}
}

/**
 * Returns the enclosing {@code IndexUnicodeProperties} instance that this property belongs
 * to.
 */
public IndexUnicodeProperties getFactory() {
    return IndexUnicodeProperties.this;
}

@Override
public boolean isTrivial() {
return _getRawUnicodeMap().isEmpty()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -884,7 +884,7 @@ private static void parsePropertyValueFile(
&& indexUnicodeProperties.ucdVersion.compareTo(
VersionInfo.UNICODE_4_0)
<= 0
? new PropertyUtilities.Joiner("|")
? IndexUnicodeProperties.MULTIVALUED_JOINER
: null;
final var originalMultivaluedSplit = propInfo.multivaluedSplit;
// The first version of kPrimaryNumeric had spaces in values.
Expand Down Expand Up @@ -995,7 +995,7 @@ private static void parseNameAliasesFile(
indexUnicodeProperties,
nextProperties,
propInfoSet,
IndexUnicodeProperties.ALPHABETIC_JOINER,
IndexUnicodeProperties.MULTIVALUED_JOINER,
false);
}
}
Expand Down
81 changes: 68 additions & 13 deletions unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,11 @@ public final UnicodeSet getSet(String propertyValue, UnicodeSet result) {
? NULL_MATCHER
: new SimpleMatcher(
propertyValue,
isType(STRING_OR_MISC_MASK) ? null : PROPERTY_COMPARATOR),
getName().equals("Name") || getName().equals("Name_Alias")
? CHARACTER_NAME_COMPARATOR
: isType(STRING_OR_MISC_MASK)
? null
: PROPERTY_COMPARATOR),
result);
}
}
Expand Down Expand Up @@ -720,39 +724,90 @@ public static String toSkeleton(String source) {
return skeletonBuffer.toString();
}

/** Returns a representative of the equivalence class of source under UAX44-LM2. */
public static String toNameSkeleton(String source) {
/**
 * Orders character names by delegating every comparison to {@code compareCharacterNames}.
 */
public static final Comparator<String> CHARACTER_NAME_COMPARATOR =
        (left, right) -> compareCharacterNames(left, right);

/**
 * Compares two character names by their non-validating name skeletons.
 *
 * <p>A null name sorts before any non-null name, and two nulls compare equal. Two references
 * to the same string compare equal without computing skeletons.
 *
 * @param a the first name, possibly null
 * @param b the second name, possibly null
 * @return a negative value, zero, or a positive value as {@code a} sorts before, with, or
 *     after {@code b}
 */
public static int compareCharacterNames(String a, String b) {
    if (a == null || b == null) {
        if (a == b) {
            return 0; // Both null.
        }
        return a == null ? -1 : 1;
    }
    if (a == b) {
        return 0; // Same object: no need to build skeletons.
    }
    final String skeletonA = toNameSkeleton(a, false);
    final String skeletonB = toNameSkeleton(b, false);
    return skeletonA.compareTo(skeletonB);
}

/**
* Returns a representative of the equivalence class of source under UAX44-LM2. If
* validate=true, checks that source contains only characters allowed in character names.
*/
public static String toNameSkeleton(String source, boolean validate) {
if (source == null) return null;
StringBuffer result = new StringBuffer();
// remove spaces, medial '-'
// we can do this with char, since no surrogates are involved
for (int i = 0; i < source.length(); ++i) {
char ch = source.charAt(i);
final char uppercase = Character.toUpperCase(ch);
if (validate && uppercase != ch) {
throw new IllegalArgumentException(
"Illegal Name Char: U+" + Utility.hex(ch) + ", " + ch);
}
ch = uppercase;
if (('0' <= ch && ch <= '9') || ('A' <= ch && ch <= 'Z') || ch == '<' || ch == '>') {
result.append(ch);
} else if (ch == ' ') {
// don't copy ever
} else if (ch == '-') {
// only copy non-medials AND trailing O-E
if (0 == i
|| i == source.length() - 1
|| source.charAt(i - 1) == ' '
|| source.charAt(i + 1) == ' '
|| (i == source.length() - 2
&& source.charAt(i - 1) == 'O'
&& source.charAt(i + 1) == 'E')) {
System.out.println("****** EXCEPTION " + source);
// Only copy a hyphen-minus if it is non-medial, or if it is
// the hyphen in U+1180 HANGUL JUNGSEONG O-E.
boolean medial;
if (0 == i || i == source.length() - 1) {
medial = false; // Name-initial or name-final.
} else {
final char preceding = Character.toUpperCase(source.charAt(i - 1));
final char following = Character.toUpperCase(source.charAt(i + 1));
medial =
(('0' <= preceding && preceding <= '9')
|| ('A' <= preceding && preceding <= 'Z'))
&& (('0' <= following && following <= '9')
|| ('A' <= following && following <= 'Z'));
}
boolean is1180 = false;
if (medial
&& i <= source.length() - 2
&& Character.toUpperCase(source.charAt(i + 1)) == 'E'
&& result.toString().equals("HANGULJUNGSEONGO")) {
is1180 = true;
for (int j = i + 2; j < source.length(); ++j) {
if (source.charAt(j) != ' ' && source.charAt(j) != '_') {
is1180 = false;
}
}
}
if (!medial || is1180) {
if (validate) {
System.out.println("****** EXCEPTION " + source);
}
result.append(ch);
}
// otherwise don't copy
} else {
} else if (validate) {
throw new IllegalArgumentException(
"Illegal Name Char: U+" + Utility.hex(ch) + ", " + ch);
} else if (ch != '_') {
result.append(Character.toUpperCase(ch));
}
}
return result.toString();
}

/**
 * Validating form of the two-argument {@code toNameSkeleton}: equivalent to calling it with
 * {@code validate} set to true.
 *
 * @param source the name to fold
 * @return the result of {@code toNameSkeleton(source, true)}
 */
public static String toNameSkeleton(String source) {
    final boolean validate = true;
    return toNameSkeleton(source, validate);
}

/**
* These routines use the Java functions, because they only need to act on ASCII. Changes space,
* - into _, inserts _ between lower and UPPER.
Expand Down
Loading