Skip to content

Commit 197be68

Browse files
aheningerFrankYFTang
authored and committed
ICU-23143 Fix regex bug with unpaired surrogates
Fix a regex bug that occurred when a pattern contained an unpaired leading surrogate. The pattern could incorrectly match half of a valid supplementary character, leaving an invalid match position and leading to subsequent assertion failures.
1 parent 522ed98 commit 197be68

File tree

3 files changed

+37
-0
lines changed

3 files changed

+37
-0
lines changed

icu4c/source/i18n/rematch.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4420,6 +4420,14 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
44204420
}
44214421
}
44224422

4423+
// If the pattern string ends with an unpaired lead surrogate that
4424+
// matched the lead surrogate of a valid pair in the input text,
4425+
// this does not count as a match.
4426+
if (success && U16_IS_LEAD(*(pInp-1)) &&
4427+
pInp < pInpLimit && U16_IS_TRAIL(*(pInp))) {
4428+
success = false;
4429+
}
4430+
44234431
if (success) {
44244432
fp->fInputIdx += stringLen;
44254433
} else {

icu4c/source/test/intltest/regextst.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include <stdlib.h>
2929
#include <stdio.h>
3030
#include <string.h>
31+
#include <iterator>
3132

3233
#include "unicode/localpointer.h"
3334
#include "unicode/regex.h"
@@ -107,6 +108,7 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
107108
TESTCASE_AUTO(TestBug13632);
108109
TESTCASE_AUTO(TestBug20359);
109110
TESTCASE_AUTO(TestBug20863);
111+
TESTCASE_AUTO(TestBug23143);
110112
TESTCASE_AUTO_END;
111113
}
112114

@@ -5815,6 +5817,32 @@ void RegexTest::TestBug20359() {
58155817
}
58165818

58175819

5820+
void RegexTest::TestBug23143() {
5821+
// Test pattern with unpaired surrogate matching against text
5822+
// with a valid surrogate pair. Originally caused an assertion failure
5823+
// in the implementation.
5824+
5825+
// Note: can't use normal C++ string literals because unpaired surrogates are illegal in them.
5826+
const char16_t regex_array[] = {u'a', 0xD805, u'.', u'*', u'b'};
5827+
UnicodeString regex(regex_array, std::size(regex_array));
5828+
5829+
const char16_t haystack_array[] = {u'a', 0xD805, 0xDF20};
5830+
UnicodeString haystack(haystack_array, std::size(haystack_array));
5831+
5832+
UErrorCode status = U_ZERO_ERROR;
5833+
std::unique_ptr<icu::RegexPattern> re(icu::RegexPattern::compile(regex, 0, status));
5834+
if (!assertSuccess(WHERE, status)) {
5835+
return;
5836+
}
5837+
// re->dumpPattern();
5838+
std::unique_ptr<icu::RegexMatcher> regex_matcher(re->matcher(haystack, status));
5839+
if (!assertSuccess(WHERE, status)) {
5840+
return;
5841+
}
5842+
assertFalse(WHERE, regex_matcher->find(0, status));
5843+
assertSuccess(WHERE, status);
5844+
}
5845+
58185846
void RegexTest::TestBug20863() {
58195847
// Test that patterns with a large number of named capture groups work correctly.
58205848
//

icu4c/source/test/intltest/regextst.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ class RegexTest: public IntlTest {
6161
virtual void TestBug13632();
6262
virtual void TestBug20359();
6363
virtual void TestBug20863();
64+
virtual void TestBug23143();
6465

6566
// The following functions are internal to the regexp tests.
6667
virtual void assertUText(const char *expected, UText *actual, const char *file, int line);

0 commit comments

Comments
 (0)