Skip to content

Commit 1450868

Browse files
eggrobin and markusicu authored
LM2 Name and Name_Alias matching (#1059)
* Don’t invert twice on comparison queries, add support for null queries, align identity queries with the draft
* more tests
* LM3 is
* comments
* out of bounds
* Check lb=@none@ (though that should probably be an error).
* Millionfold falsification
* Need to figure out how to make Name_Alias behave as an alias for Name…
* Name_Alias as a Name alias; failing test (on ne fait pas d’omelette sans casser des œufs)
* Put Humpty Dumpty together again.
* uppercase once

Co-authored-by: Markus Scherer <[email protected]>

* Don’t yell about non-medial hyphens nor the hyphen in U+1180.
* s/ff/ild/g
* isLetterOrDigit

---------

Co-authored-by: Markus Scherer <[email protected]>
1 parent af1703a commit 1450868

File tree

5 files changed

+128
-17
lines changed

5 files changed

+128
-17
lines changed

UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,8 @@
1414
import java.util.Map;
1515
import java.util.regex.Pattern;
1616
import org.unicode.cldr.util.MultiComparator;
17+
import org.unicode.props.IndexUnicodeProperties;
18+
import org.unicode.props.UcdProperty;
1719
import org.unicode.props.UcdPropertyValues;
1820
import org.unicode.props.UnicodeProperty;
1921
import org.unicode.props.UnicodeProperty.PatternMatcher;
@@ -340,6 +342,15 @@ private boolean applyPropertyAlias0(
340342
}
341343
}
342344
set = prop.getSet(propertyValue);
345+
if (set.isEmpty()
346+
&& prop instanceof IndexUnicodeProperties.IndexUnicodeProperty
347+
&& prop.getName().equals("Name")) {
348+
set =
349+
((IndexUnicodeProperties.IndexUnicodeProperty) prop)
350+
.getFactory()
351+
.getProperty(UcdProperty.Name_Alias)
352+
.getSet(propertyValue);
353+
}
343354
}
344355
} else if (isAge) {
345356
set = new UnicodeSet();

UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java

Lines changed: 47 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -175,6 +175,53 @@ public void TestInteriorlyNegatedComparison() {
175175
"[[\\p{Uppercase}\\p{Changes_When_Lowercased}]-[\\p{Uppercase}&\\p{Changes_When_Lowercased}]]");
176176
}
177177

178+
@Test
179+
public void TestNameMatching() {
180+
// UAX44-LM2 for both Name and Name_Alias.
181+
checkSetsEqual("\\p{Name=NO-BREAK SPACE}", "[\\xA0]");
182+
checkSetsEqual("\\p{Name=no break space}", "[\\xA0]");
183+
checkSetsEqual("\\p{Name=HANGUL JUNGSEONG O-E}", "[\\u1180]");
184+
checkSetsEqual("\\p{Name=HANGUL JUNGSEONG OE}", "[\\u116C]");
185+
checkSetsEqual("\\p{Name=Hangul jungseong o-e}", "[\\u1180]");
186+
checkSetsEqual("\\p{Name=Hangul jungseong oe}", "[\\u116C]");
187+
checkSetsEqual("\\p{Name=HANGUL JUNGSEONG O -E}", "[\\u1180]");
188+
checkSetsEqual("\\p{Name= HANGUL JUNGSEONG O-E }", "[\\u1180]");
189+
checkSetsEqual("\\p{Name=_HANGUL_JUNGSEONG_O-E_}", "[\\u1180]");
190+
checkSetsEqual("\\p{Name=HANGUL JUNGSEONG O-EO}", "[\\u117F]");
191+
checkSetsEqual("\\p{Name=HANGUL JUNGSEONG OE O}", "[\\u117F]");
192+
checkSetsEqual("\\p{Name=HANGUL JUNGSEONG O -EO}", "[]");
193+
checkSetsEqual("\\p{Name=MARCHEN LETTER -A}", "[\\x{11C88}]");
194+
checkSetsEqual("\\p{Name=MARCHEN_LETTER_-A}", "[\\x{11C88}]");
195+
checkSetsEqual("\\p{Name=MARCHEN LETTER A}", "[\\x{11C8F}]");
196+
checkSetsEqual("\\p{Name=TIBETAN MARK TSA -PHRU}", "[\\u0F39]");
197+
checkSetsEqual("\\p{Name=TIBETAN MARK TSA PHRU}", "[]");
198+
checkSetsEqual("\\p{Name=TIBETAN MARK BKA- SHOG YIG MGO}", "[\\u0F0A]");
199+
checkSetsEqual("\\p{Name=TIBETAN MARK BKA SHOG YIG MGO}", "[]");
200+
checkSetsEqual("\\p{Name_Alias=newline}", "[\\x0A]");
201+
checkSetsEqual("\\p{Name_Alias=NEW LINE}", "[\\x0A]");
202+
// The medial hyphen is only significant in HANGUL JUNGSEONG O-E, not in arbitrary O-E/OE.
203+
checkSetsEqual("\\p{Name=twoemdash}", "⸺");
204+
checkSetsEqual("\\p{Name=SeeNoEvil_Monkey}", "🙈");
205+
checkSetsEqual("\\p{Name=BALLET S-H-O-E-S}", "🩰");
206+
checkSetsEqual("[\\p{Name=LATIN SMALL LIGATURE O-E}uf]", "[œuf]");
207+
}
208+
209+
@Test
210+
public void TestNameAliases() {
211+
// Name_Alias values behave as aliases for Name, but not vice-versa.
212+
checkSetsEqual(
213+
"\\p{Name=PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET}", "[︘]");
214+
checkSetsEqual(
215+
"\\p{Name=PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET}", "[︘]");
216+
checkSetsEqual(
217+
"\\p{Name_Alias=PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET}",
218+
"[]");
219+
checkSetsEqual(
220+
"\\p{Name_Alias=PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET}",
221+
"[︘]");
222+
checkProperties("\\p{Name_Alias=@none@}", "[a-z]");
223+
}
224+
178225
@Test
179226
public void TestIdentityQuery() {
180227
checkSetsEqual("\\p{NFKC_Casefold=@code point@}", "\\P{Changes_When_NFKC_Casefolded}");

unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -196,6 +196,7 @@ public Map<UcdProperty, Long> getCacheFileSize() {
196196
static final Transform<String, String> fromNumericPinyin =
197197
Transliterator.getInstance("NumericPinyin-Latin;nfc");
198198

199+
static final Merge<String> MULTIVALUED_JOINER = new PropertyUtilities.Joiner("|");
199200
static final Merge<String> ALPHABETIC_JOINER =
200201
new Merge<String>() {
201202
TreeSet<String> sorted = new TreeSet<String>();
@@ -684,7 +685,7 @@ public VersionInfo getUcdVersion() {
684685
// .get(toSkeleton(propertyAlias));
685686
// }
686687

687-
class IndexUnicodeProperty extends UnicodeProperty.BaseProperty {
688+
public class IndexUnicodeProperty extends UnicodeProperty.BaseProperty {
688689

689690
private final UcdProperty prop;
690691
private final Map<String, PropertyNames> stringToNamedEnum;
@@ -724,6 +725,10 @@ class IndexUnicodeProperty extends UnicodeProperty.BaseProperty {
724725
}
725726
}
726727

728+
public IndexUnicodeProperties getFactory() {
729+
return IndexUnicodeProperties.this;
730+
}
731+
727732
@Override
728733
public boolean isTrivial() {
729734
return _getRawUnicodeMap().isEmpty()

unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -884,7 +884,7 @@ private static void parsePropertyValueFile(
884884
&& indexUnicodeProperties.ucdVersion.compareTo(
885885
VersionInfo.UNICODE_4_0)
886886
<= 0
887-
? new PropertyUtilities.Joiner("|")
887+
? IndexUnicodeProperties.MULTIVALUED_JOINER
888888
: null;
889889
final var originalMultivaluedSplit = propInfo.multivaluedSplit;
890890
// The first version of kPrimaryNumeric had spaces in values.
@@ -995,7 +995,7 @@ private static void parseNameAliasesFile(
995995
indexUnicodeProperties,
996996
nextProperties,
997997
propInfoSet,
998-
IndexUnicodeProperties.ALPHABETIC_JOINER,
998+
IndexUnicodeProperties.MULTIVALUED_JOINER,
999999
false);
10001000
}
10011001
}

unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java

Lines changed: 62 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -448,7 +448,11 @@ public final UnicodeSet getSet(String propertyValue, UnicodeSet result) {
448448
? NULL_MATCHER
449449
: new SimpleMatcher(
450450
propertyValue,
451-
isType(STRING_OR_MISC_MASK) ? null : PROPERTY_COMPARATOR),
451+
getName().equals("Name") || getName().equals("Name_Alias")
452+
? CHARACTER_NAME_COMPARATOR
453+
: isType(STRING_OR_MISC_MASK)
454+
? null
455+
: PROPERTY_COMPARATOR),
452456
result);
453457
}
454458
}
@@ -720,39 +724,83 @@ public static String toSkeleton(String source) {
720724
return skeletonBuffer.toString();
721725
}
722726

723-
/** Returns a representative of the equivalence class of source under UAX44-LM2. */
724-
public static String toNameSkeleton(String source) {
727+
public static final Comparator<String> CHARACTER_NAME_COMPARATOR =
728+
new Comparator<String>() {
729+
@Override
730+
public int compare(String o1, String o2) {
731+
return compareCharacterNames(o1, o2);
732+
}
733+
};
734+
735+
public static int compareCharacterNames(String a, String b) {
736+
if (a == b) return 0;
737+
if (a == null) return -1;
738+
if (b == null) return 1;
739+
return toNameSkeleton(a, false).compareTo(toNameSkeleton(b, false));
740+
}
741+
742+
/**
743+
* Returns a representative of the equivalence class of source under UAX44-LM2. If
744+
* validate=true, checks that source contains only characters allowed in character names.
745+
*/
746+
public static String toNameSkeleton(String source, boolean validate) {
725747
if (source == null) return null;
726-
StringBuffer result = new StringBuffer();
748+
StringBuilder result = new StringBuilder();
727749
// remove spaces, medial '-'
728750
// we can do this with char, since no surrogates are involved
729751
for (int i = 0; i < source.length(); ++i) {
730752
char ch = source.charAt(i);
753+
final char uppercase = Character.toUpperCase(ch);
754+
if (validate && uppercase != ch) {
755+
throw new IllegalArgumentException(
756+
"Illegal Name Char: U+" + Utility.hex(ch) + ", " + ch);
757+
}
758+
ch = uppercase;
731759
if (('0' <= ch && ch <= '9') || ('A' <= ch && ch <= 'Z') || ch == '<' || ch == '>') {
732760
result.append(ch);
733761
} else if (ch == ' ') {
734762
// don't copy ever
735763
} else if (ch == '-') {
736-
// only copy non-medials AND trailing O-E
737-
if (0 == i
738-
|| i == source.length() - 1
739-
|| source.charAt(i - 1) == ' '
740-
|| source.charAt(i + 1) == ' '
741-
|| (i == source.length() - 2
742-
&& source.charAt(i - 1) == 'O'
743-
&& source.charAt(i + 1) == 'E')) {
744-
System.out.println("****** EXCEPTION " + source);
764+
// Only copy a hyphen-minus if it is non-medial, or if it is
765+
// the hyphen in U+1180 HANGUL JUNGSEONG O-E.
766+
boolean medial;
767+
if (0 == i || i == source.length() - 1) {
768+
medial = false; // Name-initial or name-final.
769+
} else {
770+
medial =
771+
Character.isLetterOrDigit(source.charAt(i - 1))
772+
&& Character.isLetterOrDigit(source.charAt(i + 1));
773+
}
774+
boolean is1180 = false;
775+
if (medial
776+
&& i <= source.length() - 2
777+
&& Character.toUpperCase(source.charAt(i + 1)) == 'E'
778+
&& result.toString().equals("HANGULJUNGSEONGO")) {
779+
is1180 = true;
780+
for (int j = i + 2; j < source.length(); ++j) {
781+
if (source.charAt(j) != ' ' && source.charAt(j) != '_') {
782+
is1180 = false;
783+
}
784+
}
785+
}
786+
if (!medial || is1180) {
745787
result.append(ch);
746788
}
747789
// otherwise don't copy
748-
} else {
790+
} else if (validate) {
749791
throw new IllegalArgumentException(
750792
"Illegal Name Char: U+" + Utility.hex(ch) + ", " + ch);
793+
} else if (ch != '_') {
794+
result.append(ch);
751795
}
752796
}
753797
return result.toString();
754798
}
755799

800+
public static String toNameSkeleton(String source) {
801+
return toNameSkeleton(source, true);
802+
}
803+
756804
/**
757805
* These routines use the Java functions, because they only need to act on ASCII Changes space,
758806
* - into _, inserts _ between lower and UPPER.

0 commit comments

Comments
 (0)