Skip to content

Commit bda99ab

Browse files
committed
ICU-23313 UnicodeSet: binary query negation with NOT EQUAL TO
1 parent ac5f1bb commit bda99ab

File tree

2 files changed

+41
-2
lines changed

2 files changed

+41
-2
lines changed

icu4c/source/common/uniset_props.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -693,6 +693,7 @@ class UnicodeSet::Lexer {
693693
std::optional<int32_t> queryOperatorPosition;
694694
int32_t queryExpressionStart = parsePosition_.getIndex();
695695
bool exteriorlyNegated = false;
696+
bool interiorlyNegated = false;
696697
UBool unusedEscaped;
697698
// Do not skip whitespace so we can recognize unspaced :]. Lex escapes and
698699
// named-element: while ICU does not support string-valued properties and thus has no
@@ -742,7 +743,14 @@ class UnicodeSet::Lexer {
742743
// Neither a named-element nor an escaped-element can be part of a closing :].
743744
lastUnescaped = -1;
744745
} else if (!queryOperatorPosition.has_value() && lastUnescaped == u'=') {
745-
// TODO(egg): Propose and add support for ≠.
746+
queryOperatorPosition = parsePosition_.getIndex() - 1;
747+
} else if (!queryOperatorPosition.has_value() && lastUnescaped == u'≠') {
748+
if (exteriorlyNegated) {
749+
// Reject doubly negated property queries.
750+
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
751+
return {};
752+
}
753+
interiorlyNegated = true;
746754
queryOperatorPosition = parsePosition_.getIndex() - 1;
747755
} else if ((first == u'[' && penultimateUnescaped == u':' && lastUnescaped == u']') ||
748756
(first == u'\\' && lastUnescaped == u'}')) {
@@ -772,7 +780,7 @@ class UnicodeSet::Lexer {
772780
pattern_.tempSubStringBetween(queryExpressionStart,
773781
queryOperatorPosition.value_or(queryExpressionLimit)),
774782
propertyPredicate, errorCode);
775-
if (exteriorlyNegated) {
783+
if (exteriorlyNegated != interiorlyNegated) {
776784
result.complement().removeAllStrings();
777785
}
778786
result.setPattern(pattern_.tempSubStringBetween(queryStart, parsePosition_.getIndex()));

icu4c/source/test/intltest/usettest.cpp

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1118,6 +1118,34 @@ void UnicodeSetTest::TestPropertySet() {
11181118
expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
11191119
CharsToUnicodeString(DATA[i+2]));
11201120
}
1121+
{
1122+
UErrorCode status = U_ZERO_ERROR;
1123+
UnicodeSet s1(u"[:Noncharacter_Code_Point≠No:]", status);
1124+
UnicodeSet s2(u"[:Noncharacter_Code_Point:]", status);
1125+
TEST_ASSERT_SUCCESS(status);
1126+
TEST_ASSERT(s1 == s2);
1127+
}
1128+
{
1129+
UErrorCode status = U_ZERO_ERROR;
1130+
UnicodeSet s1(uR"(\p{Noncharacter_Code_Point≠No})", status);
1131+
UnicodeSet s2(uR"(\p{Noncharacter_Code_Point})", status);
1132+
TEST_ASSERT_SUCCESS(status);
1133+
TEST_ASSERT(s1 == s2);
1134+
}
1135+
{
1136+
UErrorCode status = U_ZERO_ERROR;
1137+
UnicodeSet s1(uR"(\p{dt≠can})", status);
1138+
UnicodeSet s2(uR"(\P{dt=can})", status);
1139+
TEST_ASSERT_SUCCESS(status);
1140+
TEST_ASSERT(s1 == s2);
1141+
}
1142+
{
1143+
UErrorCode status = U_ZERO_ERROR;
1144+
UnicodeSet s1(uR"([:dt≠can:])", status);
1145+
UnicodeSet s2(uR"([:^dt=can:])", status);
1146+
TEST_ASSERT_SUCCESS(status);
1147+
TEST_ASSERT(s1 == s2);
1148+
}
11211149
}
11221150

11231151
/**
@@ -4778,6 +4806,9 @@ void UnicodeSetTest::TestParseErrors() {
47784806
uR"(\p{Uppercase_Letter=})",
47794807
// Well-formed in ICU 78 and earlier, disallowed by ICU-23306.
47804808
uR"([: ^general category = punctuation :])",
4809+
// Doubly negated property queries.
4810+
uR"(\P{Decomposition_Type≠compat})",
4811+
u"[:^Noncharacter_Code_Point≠No:]",
47814812
}) {
47824813
UErrorCode errorCode = U_ZERO_ERROR;
47834814
const UnicodeSet set(expression, errorCode);

0 commit comments

Comments
 (0)