Skip to content

Commit 888a046

Browse files
authored
[Fix](regexp) make dot match newline in regexp_fn by default (#60831)
### What problem does this PR solve?

Issue Number: close #xxx

Related PR: #xxx

Problem Summary:

### Release note

doc: apache/doris-website#3410

Keep the behavior of `regexp_fn` consistent with `regexp`.

Before:

```text
Doris> SELECT REGEXP_EXTRACT('foo\nbar', '^(.+)$', 1);
+-----------------------------------------+
| REGEXP_EXTRACT('foo\nbar', '^(.+)$', 1) |
+-----------------------------------------+
|                                         |
+-----------------------------------------+
```

After:

```text
Doris> SELECT REGEXP_EXTRACT('foo\nbar', '^(.+)$', 1);
+-----------------------------------------+
| REGEXP_EXTRACT('foo\nbar', '^(.+)$', 1) |
+-----------------------------------------+
| foo
bar                                 |
+-----------------------------------------+
```

None

### Check List (For Author)

- Test <!-- At least one of them must be included. -->
    - [ ] Regression test
    - [ ] Unit Test
    - [ ] Manual test (add detailed scripts or steps below)
    - [ ] No need to test or manual test. Explain why:
        - [ ] This is a refactor/code format and no logic has been changed.
        - [ ] Previous test can cover this change.
        - [ ] No code files have been changed.
        - [ ] Other reason <!-- Add your reason? -->

- Behavior changed:
    - [ ] No.
    - [ ] Yes. <!-- Explain the behavior change -->

- Does this need documentation?
    - [ ] No.
    - [ ] Yes. <!-- Add document PR link here. eg: apache/doris-website#1214 -->

### Check List (For Reviewer who merge this PR)

- [ ] Confirm the release note
- [ ] Confirm test cases
- [ ] Confirm document
- [ ] Add branch pick label <!-- Add branch pick label that this PR should merge into -->
1 parent d7ae570 commit 888a046

File tree

3 files changed

+64
-0
lines changed

3 files changed

+64
-0
lines changed

be/src/vec/functions/function_regexp.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -66,6 +66,7 @@ struct RegexpExtractEngine {
6666
RegexpExtractEngine& engine, bool enable_extended_regex) {
6767
re2::RE2::Options options;
6868
options.set_log_errors(false); // avoid RE2 printing to stderr; we handle errors ourselves
69+
options.set_dot_nl(true); // make '.' match '\n' by default, consistent with REGEXP/LIKE
6970
engine.re2_regex =
7071
std::make_unique<re2::RE2>(re2::StringPiece(pattern.data, pattern.size), options);
7172

regression-test/data/query_p0/sql_functions/string_functions/test_string_function_regexp.out

Lines changed: 45 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -163,6 +163,21 @@ EdgeCase1
163163
-- !regexp_extract_3 --
164164
AA-1
165165

166+
-- !regexp_extract_4 --
167+
foo\nbar
168+
169+
-- !regexp_extract_5 --
170+
171+
172+
-- !regexp_extract_6 --
173+
foo\nbar
174+
175+
-- !regexp_extract_7 --
176+
aXb
177+
178+
-- !regexp_extract_8 --
179+
aXb
180+
166181
-- !sql --
167182
b
168183

@@ -178,6 +193,21 @@ d
178193
-- !regexp_extract_or_null_2 --
179194
B
180195

196+
-- !regexp_extract_or_null_3 --
197+
foo\nbar
198+
199+
-- !regexp_extract_or_null_4 --
200+
\N
201+
202+
-- !regexp_extract_or_null_5 --
203+
foo\nbar
204+
205+
-- !regexp_extract_or_null_6 --
206+
aXb
207+
208+
-- !regexp_extract_or_null_7 --
209+
aXb
210+
181211
-- !sql --
182212
['18','17']
183213

@@ -217,6 +247,21 @@ B
217247
-- !sql_regexp_extract_all_5 --
218248
['Case1','Case2','Case3']
219249

250+
-- !sql_regexp_extract_all_6 --
251+
['foo\nbar']
252+
253+
-- !sql_regexp_extract_all_7 --
254+
255+
256+
-- !sql_regexp_extract_all_8 --
257+
['foo\nbar']
258+
259+
-- !sql_regexp_extract_all_9 --
260+
['aXb','cXd']
261+
262+
-- !sql_regexp_extract_all_10 --
263+
['aXb','cXd']
264+
220265
-- !sql --
221266
a-b-c
222267

regression-test/suites/query_p0/sql_functions/string_functions/test_string_function_regexp.groovy

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -76,6 +76,12 @@ suite("test_string_function_regexp") {
7676
qt_regexp_extract_3 'SELECT regexp_extract(\'ID:AA-1,ID:BB-2,ID:CC-3\', \'(?<=ID:)([A-Z]{2}-\\\\d)(?=,ID|$)\', 1);'
7777
sql "set enable_extended_regex = false;"
7878

79+
qt_regexp_extract_4 "SELECT REGEXP_EXTRACT(concat('foo', char(10), 'bar'), '(foo.bar)', 1);"
80+
qt_regexp_extract_5 "SELECT REGEXP_EXTRACT(concat('foo', char(10), 'bar'), '(?-s)(foo.bar)', 1);"
81+
qt_regexp_extract_6 "SELECT REGEXP_EXTRACT(concat('foo', char(10), 'bar'), '(?s)(foo.bar)', 1);"
82+
qt_regexp_extract_7 "SELECT REGEXP_EXTRACT(concat('aXb', char(10), 'cXd'), '(?-s)(a.b)', 1);"
83+
qt_regexp_extract_8 "SELECT REGEXP_EXTRACT(concat('aXb', char(10), 'cXd'), '(a.b)', 1);"
84+
7985
qt_sql "SELECT regexp_extract_or_null('AbCdE', '([[:lower:]]+)C([[:lower:]]+)', 1);"
8086
qt_sql "SELECT regexp_extract_or_null('AbCdE', '([[:lower:]]+)C([[:lower:]]+)', 2);"
8187
qt_sql "SELECT regexp_extract_or_null('AbCdE', '([[:lower:]]+)C([[:lower:]]+)', 3);"
@@ -90,6 +96,12 @@ suite("test_string_function_regexp") {
9096
qt_regexp_extract_or_null_2 "SELECT regexp_extract_or_null('TokenA TokenB TokenC', '(?<=Token)([A-Z])(?= TokenC)', 1);"
9197
sql "set enable_extended_regex = false;"
9298

99+
qt_regexp_extract_or_null_3 "SELECT REGEXP_EXTRACT_OR_NULL(concat('foo', char(10), 'bar'), '(foo.bar)', 1);"
100+
qt_regexp_extract_or_null_4 "SELECT REGEXP_EXTRACT_OR_NULL(concat('foo', char(10), 'bar'), '(?-s)(foo.bar)', 1);"
101+
qt_regexp_extract_or_null_5 "SELECT REGEXP_EXTRACT_OR_NULL(concat('foo', char(10), 'bar'), '(?s)(foo.bar)', 1);"
102+
qt_regexp_extract_or_null_6 "SELECT REGEXP_EXTRACT_OR_NULL(concat('aXb', char(10), 'cXd'), '(?-s)(a.b)', 1);"
103+
qt_regexp_extract_or_null_7 "SELECT REGEXP_EXTRACT_OR_NULL(concat('aXb', char(10), 'cXd'), '(a.b)', 1);"
104+
93105
qt_sql "SELECT regexp_extract_all('x=a3&x=18abc&x=2&y=3&x=4&x=17bcd', 'x=([0-9]+)([a-z]+)');"
94106
qt_sql "SELECT regexp_extract_all('http://a.m.baidu.com/i41915i73660.htm', 'i([0-9]+)');"
95107
qt_sql "SELECT regexp_extract_all('abc=111, def=222, ghi=333', '(\"[^\"]+\"|\\\\w+)=(\"[^\"]+\"|\\\\w+)');"
@@ -110,6 +122,12 @@ suite("test_string_function_regexp") {
110122
qt_sql_regexp_extract_all_5 'SELECT REGEXP_EXTRACT_ALL(\'EdgeCase1EdgeCase2EdgeCase3\', \'(?<=Edge)(Case\\\\d)(?=Edge|$)\');'
111123
sql "set enable_extended_regex = false;"
112124

125+
qt_sql_regexp_extract_all_6 "SELECT REGEXP_EXTRACT_ALL(concat('foo', char(10), 'bar'), '(foo.bar)');"
126+
qt_sql_regexp_extract_all_7 "SELECT REGEXP_EXTRACT_ALL(concat('foo', char(10), 'bar'), '(?-s)(foo.bar)');"
127+
qt_sql_regexp_extract_all_8 "SELECT REGEXP_EXTRACT_ALL(concat('foo', char(10), 'bar'), '(?s)(foo.bar)');"
128+
qt_sql_regexp_extract_all_9 "SELECT REGEXP_EXTRACT_ALL(concat('aXb', char(10), 'cXd'), '(?-s)(\\\\w.\\\\w)');"
129+
qt_sql_regexp_extract_all_10 "SELECT REGEXP_EXTRACT_ALL(concat('aXb', char(10), 'cXd'), '(\\\\w.\\\\w)');"
130+
113131
qt_sql "SELECT regexp_replace('a b c', \" \", \"-\");"
114132
qt_sql "SELECT regexp_replace('a b c','(b)','<\\\\1>');"
115133

0 commit comments

Comments
 (0)