Skip to content

Commit 4b599d9

Browse files
fix escape issue in test
1 parent 5de2413 commit 4b599d9

File tree

2 files changed

+323
-311
lines changed

2 files changed

+323
-311
lines changed
Lines changed: 323 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,323 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.comet
21+
22+
import org.apache.spark.sql.CometTestBase
23+
24+
class CometRegexpExpressionSuite extends CometTestBase {
25+
26+
// Compares Comet with Spark for regexp_extract over matching, non-matching,
// empty and NULL inputs, using several group indices and degenerate patterns.
test("regexp_extract basic") {
  val rows = Seq(
    ("100-200", 1),
    ("300-400", 1),
    (null, 1), // NULL input
    ("no-match", 1), // no match → should return ""
    ("abc123def456", 1),
    ("", 1) // empty string
  )

  // Each query is checked in order against the same table.
  val queries = Seq(
    // group 0 (full match)
    "SELECT regexp_extract(_1, '(\\\\d+)-(\\\\d+)', 0) FROM tbl",
    // group 1
    "SELECT regexp_extract(_1, '(\\\\d+)-(\\\\d+)', 1) FROM tbl",
    // group 2
    "SELECT regexp_extract(_1, '(\\\\d+)-(\\\\d+)', 2) FROM tbl",
    // empty pattern
    "SELECT regexp_extract(_1, '', 0) FROM tbl",
    // null pattern
    "SELECT regexp_extract(_1, NULL, 0) FROM tbl")

  withSQLConf(CometConf.COMET_REGEXP_ALLOW_INCOMPATIBLE.key -> "true") {
    withParquetTable(rows, "tbl") {
      queries.foreach(sql => checkSparkAnswerAndOperator(sql))
    }
  }
}
51+
52+
// Compares Comet with Spark for regexp_extract on e-mail, phone-number and
// price shaped inputs, plus a NULL row.
test("regexp_extract edge cases") {
  withSQLConf(CometConf.COMET_REGEXP_ALLOW_INCOMPATIBLE.key -> "true") {
    // NOTE(review): the first literal previously held an obfuscation placeholder
    // without any '@', so the domain pattern below could never match. A
    // representative address is used so the query exercises a real match.
    val data =
      Seq(("user@example.com", 1), ("phone: 123-456-7890", 1), ("price: $99.99", 1), (null, 1))

    withParquetTable(data, "tbl") {
      // Extract email domain
      checkSparkAnswerAndOperator("SELECT regexp_extract(_1, '@([^.]+)', 1) FROM tbl")
      // Extract phone number
      checkSparkAnswerAndOperator(
        "SELECT regexp_extract(_1, '(\\\\d{3}-\\\\d{3}-\\\\d{4})', 1) FROM tbl")
      // Extract price
      checkSparkAnswerAndOperator(
        "SELECT regexp_extract(_1, '\\\\$(\\\\d+\\\\.\\\\d+)', 1) FROM tbl")
    }
  }
}
69+
70+
// Compares Comet with Spark for regexp_extract_all over matching, non-matching,
// empty and NULL inputs, with and without a capture group in the pattern.
test("regexp_extract_all basic") {
  val rows = Seq(
    ("a1b2c3", 1),
    ("test123test456", 1),
    (null, 1), // NULL input
    ("no digits", 1), // no match → should return []
    ("", 1) // empty string
  )

  // Each query is checked in order against the same table.
  val queries = Seq(
    // explicit group 0 on a pattern without capture groups
    "SELECT regexp_extract_all(_1, '\\\\d+', 0) FROM tbl",
    // explicit group 0 on a pattern with one capture group
    "SELECT regexp_extract_all(_1, '(\\\\d+)', 0) FROM tbl",
    // group 1
    "SELECT regexp_extract_all(_1, '(\\\\d+)', 1) FROM tbl",
    // empty pattern
    "SELECT regexp_extract_all(_1, '', 0) FROM tbl",
    // null pattern
    "SELECT regexp_extract_all(_1, NULL, 0) FROM tbl")

  withSQLConf(CometConf.COMET_REGEXP_ALLOW_INCOMPATIBLE.key -> "true") {
    withParquetTable(rows, "tbl") {
      queries.foreach(sql => checkSparkAnswerAndOperator(sql))
    }
  }
}
94+
95+
// Compares Comet with Spark for regexp_extract_all on rows that contain
// several occurrences of the searched pattern.
test("regexp_extract_all multiple matches") {
  val rows = Seq(
    ("The prices are $10, $20, and $30", 1),
    ("colors: red, green, blue", 1),
    ("words: hello world", 1),
    (null, 1))

  val priceQuery = "SELECT regexp_extract_all(_1, '\\\\$(\\\\d+)', 1) FROM tbl"
  val wordQuery = "SELECT regexp_extract_all(_1, '([a-z]+)', 1) FROM tbl"

  withSQLConf(CometConf.COMET_REGEXP_ALLOW_INCOMPATIBLE.key -> "true") {
    withParquetTable(rows, "tbl") {
      // All prices first, then all lowercase words.
      Seq(priceQuery, wordQuery).foreach(sql => checkSparkAnswerAndOperator(sql))
    }
  }
}
111+
112+
// Compares Comet with Spark for regexp_extract_all on a 2000-row table built
// from seven repeating values, with the Parquet dictionary option enabled.
test("regexp_extract_all with dictionary encoding") {
  withSQLConf(
    CometConf.COMET_REGEXP_ALLOW_INCOMPATIBLE.key -> "true",
    "parquet.enable.dictionary" -> "true") {
    // Use repeated values to trigger dictionary encoding
    // Mix short strings, long strings, and various patterns
    val longString1 = "prefix" + ("abc" * 100) + "123" + ("xyz" * 100) + "456"
    val longString2 = "start" + ("test" * 200) + "789" + ("end" * 150)

    // The seven values cycled through by row index.
    // NOTE(review): the e-mail/phone entry previously held an obfuscation
    // placeholder without any '@', so the '@([a-z]*)' query below could never
    // match. A representative address is used instead.
    val samples = Vector(
      "a1b2c3", // Simple repeated pattern
      "x5y6", // Another simple pattern
      "no-match", // No digits
      longString1, // Long string with digits
      longString2, // Another long string
      "user@example.com:123-456-7890", // Complex pattern
      "" // Empty string
    )

    val data = (0 until 2000).map(i => (samples(i % samples.length), 1))

    withParquetTable(data, "tbl") {
      // Test simple pattern
      checkSparkAnswerAndOperator("SELECT regexp_extract_all(_1, '(\\\\d+)') FROM tbl")
      checkSparkAnswerAndOperator("SELECT regexp_extract_all(_1, '(\\\\d+)', 0) FROM tbl")

      // Test complex patterns
      checkSparkAnswerAndOperator(
        "SELECT regexp_extract_all(_1, '(\\\\d{3}-\\\\d{3}-\\\\d{4})', 0) FROM tbl")
      checkSparkAnswerAndOperator("SELECT regexp_extract_all(_1, '@([a-z]*)', 1) FROM tbl")

      // Test with multiple groups
      checkSparkAnswerAndOperator(
        "SELECT regexp_extract_all(_1, '([a-z])(\\\\d*)', 1) FROM tbl")
    }
  }
}
150+
151+
// Compares Comet with Spark for regexp_extract on a 2000-row table built from
// seven repeating values, with the Parquet dictionary option enabled.
test("regexp_extract with dictionary encoding") {
  withSQLConf(
    CometConf.COMET_REGEXP_ALLOW_INCOMPATIBLE.key -> "true",
    "parquet.enable.dictionary" -> "true") {
    // Two long values with a digit run embedded between filler characters.
    val paddedNines = "data" + ("x" * 500) + "999" + ("y" * 500)
    val paddedSevens = ("a" * 1000) + "777" + ("b" * 1000)

    // Values are repeated by cycling through this table by row index.
    val samples = Vector(
      "a1b2c3",
      "x5y6",
      "no-match",
      paddedNines,
      paddedSevens,
      "IP:192.168.1.100-PORT:8080",
      "")

    val rows = (0 until 2000).map { idx =>
      (samples(idx % 7), 1)
    }

    val queries = Seq(
      // first match with a simple pattern
      "SELECT regexp_extract(_1, '(\\\\d+)', 1) FROM tbl",
      // complex patterns
      "SELECT regexp_extract(_1, '(\\\\d+)\\\\.(\\\\d+)\\\\.(\\\\d+)\\\\.(\\\\d+)', 1) FROM tbl",
      "SELECT regexp_extract(_1, 'PORT:(\\\\d+)', 1) FROM tbl",
      // multiple groups - extract second group
      "SELECT regexp_extract(_1, '([a-z])(\\\\d+)', 2) FROM tbl")

    withParquetTable(rows, "tbl") {
      queries.foreach(sql => checkSparkAnswerAndOperator(sql))
    }
  }
}
187+
188+
// Compares Comet with Spark for regexp_extract / regexp_extract_all on inputs
// containing non-ASCII scripts, control characters and punctuation.
test("regexp_extract unicode and special characters") {
  // CometConf lives in this file's own package, so no local import is needed
  // (matching the other tests in this suite).
  withSQLConf(CometConf.COMET_REGEXP_ALLOW_INCOMPATIBLE.key -> "true") {
    val data = Seq(
      ("测试123test", 1), // Chinese characters
      ("日本語456にほんご", 1), // Japanese characters
      ("한글789Korean", 1), // Korean characters
      ("Привет999Hello", 1), // Cyrillic
      ("line1\nline2", 1), // Newline
      ("tab\there", 1), // Tab
      ("special: $#@!%^&*", 1), // Special chars
      ("mixed测试123test日本語", 1), // Mixed unicode
      (null, 1))

    withParquetTable(data, "tbl") {
      // Extract digits from unicode text
      checkSparkAnswerAndOperator("SELECT regexp_extract(_1, '(\\\\d+)', 1) FROM tbl")
      checkSparkAnswerAndOperator("SELECT regexp_extract_all(_1, '(\\\\d+)', 1) FROM tbl")

      // Extract runs of ASCII letters embedded in unicode text
      checkSparkAnswerAndOperator("SELECT regexp_extract(_1, '([a-zA-Z]+)', 1) FROM tbl")
      checkSparkAnswerAndOperator("SELECT regexp_extract_all(_1, '([a-zA-Z]+)', 1) FROM tbl")
    }
  }
}
214+
215+
// Compares Comet with Spark for regexp_extract_all when the pattern has two or
// three capture groups and each group index is requested in turn.
test("regexp_extract_all multiple groups") {
  // CometConf lives in this file's own package, so no local import is needed
  // (matching the other tests in this suite).
  withSQLConf(CometConf.COMET_REGEXP_ALLOW_INCOMPATIBLE.key -> "true") {
    val data = Seq(
      ("a1b2c3", 1),
      ("x5y6z7", 1),
      ("test123demo456end789", 1),
      (null, 1),
      ("no match here", 1))

    withParquetTable(data, "tbl") {
      // Test extracting different groups - full match
      checkSparkAnswerAndOperator(
        "SELECT regexp_extract_all(_1, '([a-z])(\\\\d+)', 0) FROM tbl")
      // Test extracting group 1 (letters)
      checkSparkAnswerAndOperator(
        "SELECT regexp_extract_all(_1, '([a-z])(\\\\d+)', 1) FROM tbl")
      // Test extracting group 2 (digits)
      checkSparkAnswerAndOperator(
        "SELECT regexp_extract_all(_1, '([a-z])(\\\\d+)', 2) FROM tbl")

      // Test with three groups
      checkSparkAnswerAndOperator(
        "SELECT regexp_extract_all(_1, '([a-z]+)(\\\\d+)([a-z]+)', 1) FROM tbl")
      checkSparkAnswerAndOperator(
        "SELECT regexp_extract_all(_1, '([a-z]+)(\\\\d+)([a-z]+)', 2) FROM tbl")
      checkSparkAnswerAndOperator(
        "SELECT regexp_extract_all(_1, '([a-z]+)(\\\\d+)([a-z]+)', 3) FROM tbl")
    }
  }
}
247+
248+
// Compares Comet with Spark for regexp_extract on structured inputs (date, IP
// address, e-mail, markup, timestamp, version string, RGB tuple) and for
// regexp_extract_all over the same rows.
test("regexp_extract complex patterns") {
  // CometConf lives in this file's own package, so no local import is needed
  // (matching the other tests in this suite).
  withSQLConf(CometConf.COMET_REGEXP_ALLOW_INCOMPATIBLE.key -> "true") {
    // NOTE(review): the e-mail row previously held an obfuscation placeholder
    // without any '@', so the '@([a-z.]+)' query below could never match. A
    // representative multi-part address is used instead.
    val data = Seq(
      ("2024-01-15", 1), // Date
      ("192.168.1.1", 1), // IP address
      ("user.name+tag@example.co.uk", 1), // Complex email
      ("<tag>content</tag>", 1), // HTML-like
      ("Time: 14:30:45.123", 1), // Timestamp
      ("Version: 1.2.3-beta", 1), // Version string
      ("RGB(255,128,0)", 1), // RGB color
      (null, 1))

    withParquetTable(data, "tbl") {
      // Extract year from date
      checkSparkAnswerAndOperator(
        "SELECT regexp_extract(_1, '(\\\\d{4})-(\\\\d{2})-(\\\\d{2})', 1) FROM tbl")

      // Extract month from date
      checkSparkAnswerAndOperator(
        "SELECT regexp_extract(_1, '(\\\\d{4})-(\\\\d{2})-(\\\\d{2})', 2) FROM tbl")

      // Extract IP octets
      checkSparkAnswerAndOperator(
        "SELECT regexp_extract(_1, '(\\\\d+)\\\\.(\\\\d+)\\\\.(\\\\d+)\\\\.(\\\\d+)', 2) FROM tbl")

      // Extract email domain
      checkSparkAnswerAndOperator("SELECT regexp_extract(_1, '@([a-z.]+)', 1) FROM tbl")

      // Extract time components
      checkSparkAnswerAndOperator(
        "SELECT regexp_extract(_1, '(\\\\d{2}):(\\\\d{2}):(\\\\d{2})', 1) FROM tbl")

      // Extract RGB values
      checkSparkAnswerAndOperator(
        "SELECT regexp_extract(_1, 'RGB\\\\((\\\\d+),(\\\\d+),(\\\\d+)\\\\)', 2) FROM tbl")

      // Test regexp_extract_all with complex patterns
      checkSparkAnswerAndOperator("SELECT regexp_extract_all(_1, '(\\\\d+)', 1) FROM tbl")
    }
  }
}
291+
292+
// Compares Comet with Spark when regexp_extract and regexp_extract_all are
// projected side by side in the same query.
test("regexp_extract vs regexp_extract_all comparison") {
  // CometConf lives in this file's own package, so no local import is needed
  // (matching the other tests in this suite).
  withSQLConf(CometConf.COMET_REGEXP_ALLOW_INCOMPATIBLE.key -> "true") {
    val data = Seq(("a1b2c3", 1), ("x5y6", 1), (null, 1), ("no digits", 1), ("single7match", 1))

    withParquetTable(data, "tbl") {
      // Triple-quoted Scala strings do not process backslash escapes, so the SQL
      // text must carry exactly two backslashes (`\\d`); the SQL string-literal
      // parser then reduces that to the regex `\d`. Four backslashes here would
      // yield a regex matching a literal backslash followed by 'd' characters,
      // which never matches the digits in the test data.

      // Compare single extraction vs all extractions in one query
      checkSparkAnswerAndOperator("""SELECT
          |  regexp_extract(_1, '(\\d+)', 1) as first_match,
          |  regexp_extract_all(_1, '(\\d+)', 1) as all_matches
          |FROM tbl""".stripMargin)

      // Verify regexp_extract returns first match only while regexp_extract_all returns all
      checkSparkAnswerAndOperator("""SELECT
          |  _1,
          |  regexp_extract(_1, '(\\d+)', 1) as first_digit,
          |  regexp_extract_all(_1, '(\\d+)', 1) as all_digits
          |FROM tbl""".stripMargin)

      // Test with multiple groups
      checkSparkAnswerAndOperator("""SELECT
          |  regexp_extract(_1, '([a-z])(\\d+)', 1) as first_letter,
          |  regexp_extract_all(_1, '([a-z])(\\d+)', 1) as all_letters,
          |  regexp_extract(_1, '([a-z])(\\d+)', 2) as first_digit,
          |  regexp_extract_all(_1, '([a-z])(\\d+)', 2) as all_digits
          |FROM tbl""".stripMargin)
    }
  }
}
322+
323+
}

0 commit comments

Comments
 (0)