Skip to content

Commit 0b7e86c

Browse files
RyanL1997Swiddis
andauthored
[Enhancement] Error handling for illegal character usage in java regex named capture group (#4434)
Co-authored-by: Simeon Widdis <[email protected]>
1 parent 9c97cfb commit 0b7e86c

File tree

6 files changed

+285
-24
lines changed

6 files changed

+285
-24
lines changed

core/src/main/java/org/opensearch/sql/expression/parse/RegexCommonUtils.java

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ public class RegexCommonUtils {
2222
private static final Pattern NAMED_GROUP_PATTERN =
2323
Pattern.compile("\\(\\?<([a-zA-Z][a-zA-Z0-9]*)>");
2424

25+
// Pattern to extract ANY named group (valid or invalid) for validation
26+
private static final Pattern ANY_NAMED_GROUP_PATTERN = Pattern.compile("\\(\\?<([^>]+)>");
27+
2528
private static final int MAX_CACHE_SIZE = 1000;
2629

2730
private static final Map<String, Pattern> patternCache =
@@ -50,20 +53,52 @@ public static Pattern getCompiledPattern(String regex) {
5053
}
5154

5255
/**
53-
* Extract list of named group candidates from a regex pattern.
56+
* Extract list of named group candidates from a regex pattern. Validates that all group names
57+
* conform to Java regex named group requirements: must start with a letter and contain only
58+
* letters and digits.
5459
*
5560
* @param pattern The regex pattern string
56-
* @return List of named group names found in the pattern
61+
* @return List of valid named group names found in the pattern
62+
* @throws IllegalArgumentException if any named groups contain invalid characters
5763
*/
5864
public static List<String> getNamedGroupCandidates(String pattern) {
5965
ImmutableList.Builder<String> namedGroups = ImmutableList.builder();
60-
Matcher m = NAMED_GROUP_PATTERN.matcher(pattern);
61-
while (m.find()) {
62-
namedGroups.add(m.group(1));
66+
67+
Matcher anyGroupMatcher = ANY_NAMED_GROUP_PATTERN.matcher(pattern);
68+
while (anyGroupMatcher.find()) {
69+
String groupName = anyGroupMatcher.group(1);
70+
71+
if (!isValidJavaRegexGroupName(groupName)) {
72+
throw new IllegalArgumentException(
73+
String.format(
74+
"Invalid capture group name '%s'. Java regex group names must start with a letter"
75+
+ " and contain only letters and digits.",
76+
groupName));
77+
}
6378
}
79+
80+
Matcher validGroupMatcher = NAMED_GROUP_PATTERN.matcher(pattern);
81+
while (validGroupMatcher.find()) {
82+
namedGroups.add(validGroupMatcher.group(1));
83+
}
84+
6485
return namedGroups.build();
6586
}
6687

88+
/**
89+
* Validates if a group name conforms to Java regex named group requirements. Java regex group
90+
* names must: - Start with a letter (a-z, A-Z) - Contain only letters (a-z, A-Z) and digits (0-9)
91+
*
92+
* @param groupName The group name to validate
93+
* @return true if valid, false otherwise
94+
*/
95+
private static boolean isValidJavaRegexGroupName(String groupName) {
96+
if (groupName == null || groupName.isEmpty()) {
97+
return false;
98+
}
99+
return groupName.matches("^[A-Za-z][A-Za-z0-9]*$");
100+
}
101+
67102
/**
68103
* Match using find() for partial match semantics with string pattern.
69104
*

core/src/test/java/org/opensearch/sql/expression/parse/RegexCommonUtilsTest.java

Lines changed: 105 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -194,12 +194,15 @@ public void testGetNamedGroupCandidatesWithNumericNames() {
194194
}
195195

196196
@Test
197-
public void testGetNamedGroupCandidatesSpecialCharacters() {
198-
// Test that groups with special characters are not captured (only alphanumeric starting with
199-
// letter)
200-
String pattern = "(?<valid_group>[a-z]+)\\s+(?<123invalid>[0-9]+)\\s+(?<also-invalid>.*)";
201-
List<String> groups = RegexCommonUtils.getNamedGroupCandidates(pattern);
202-
assertEquals(0, groups.size());
197+
public void testGetNamedGroupCandidatesWithInvalidCharactersThrowsException() {
198+
// Test that groups with invalid characters throw exception (even if some are valid)
199+
String pattern = "(?<validgroup>[a-z]+)\\s+(?<123invalid>[0-9]+)\\s+(?<also-invalid>.*)";
200+
IllegalArgumentException exception =
201+
assertThrows(
202+
IllegalArgumentException.class,
203+
() -> RegexCommonUtils.getNamedGroupCandidates(pattern));
204+
// Should fail on the first invalid group name found
205+
assertTrue(exception.getMessage().contains("Invalid capture group name"));
203206
}
204207

205208
@Test
@@ -211,4 +214,100 @@ public void testGetNamedGroupCandidatesValidAlphanumeric() {
211214
assertEquals("groupA", groups.get(0));
212215
assertEquals("group2B", groups.get(1));
213216
}
217+
218+
@Test
219+
public void testGetNamedGroupCandidatesWithUnderscore() {
220+
// Test that underscores in named groups throw IllegalArgumentException
221+
String patternWithUnderscore = ".+@(?<domain_name>.+)";
222+
IllegalArgumentException exception =
223+
assertThrows(
224+
IllegalArgumentException.class,
225+
() -> RegexCommonUtils.getNamedGroupCandidates(patternWithUnderscore));
226+
assertTrue(exception.getMessage().contains("Invalid capture group name 'domain_name'"));
227+
assertTrue(
228+
exception
229+
.getMessage()
230+
.contains("must start with a letter and contain only letters and digits"));
231+
}
232+
233+
@Test
234+
public void testGetNamedGroupCandidatesWithHyphen() {
235+
// Test that hyphens in named groups throw IllegalArgumentException
236+
String patternWithHyphen = ".+@(?<domain-name>.+)";
237+
IllegalArgumentException exception =
238+
assertThrows(
239+
IllegalArgumentException.class,
240+
() -> RegexCommonUtils.getNamedGroupCandidates(patternWithHyphen));
241+
assertTrue(exception.getMessage().contains("Invalid capture group name 'domain-name'"));
242+
assertTrue(
243+
exception
244+
.getMessage()
245+
.contains("must start with a letter and contain only letters and digits"));
246+
}
247+
248+
@Test
249+
public void testGetNamedGroupCandidatesWithDot() {
250+
// Test that dots in named groups throw IllegalArgumentException
251+
String patternWithDot = ".+@(?<domain.name>.+)";
252+
IllegalArgumentException exception =
253+
assertThrows(
254+
IllegalArgumentException.class,
255+
() -> RegexCommonUtils.getNamedGroupCandidates(patternWithDot));
256+
assertTrue(exception.getMessage().contains("Invalid capture group name 'domain.name'"));
257+
}
258+
259+
@Test
260+
public void testGetNamedGroupCandidatesWithSpace() {
261+
// Test that spaces in named groups throw IllegalArgumentException
262+
String patternWithSpace = ".+@(?<domain name>.+)";
263+
IllegalArgumentException exception =
264+
assertThrows(
265+
IllegalArgumentException.class,
266+
() -> RegexCommonUtils.getNamedGroupCandidates(patternWithSpace));
267+
assertTrue(exception.getMessage().contains("Invalid capture group name 'domain name'"));
268+
}
269+
270+
@Test
271+
public void testGetNamedGroupCandidatesStartingWithDigit() {
272+
// Test that group names starting with digit throw IllegalArgumentException
273+
String patternStartingWithDigit = ".+@(?<1domain>.+)";
274+
IllegalArgumentException exception =
275+
assertThrows(
276+
IllegalArgumentException.class,
277+
() -> RegexCommonUtils.getNamedGroupCandidates(patternStartingWithDigit));
278+
assertTrue(exception.getMessage().contains("Invalid capture group name '1domain'"));
279+
}
280+
281+
@Test
282+
public void testGetNamedGroupCandidatesWithSpecialCharacters() {
283+
// Test that special characters in named groups throw IllegalArgumentException
284+
String patternWithSpecialChar = ".+@(?<domain@name>.+)";
285+
IllegalArgumentException exception =
286+
assertThrows(
287+
IllegalArgumentException.class,
288+
() -> RegexCommonUtils.getNamedGroupCandidates(patternWithSpecialChar));
289+
assertTrue(exception.getMessage().contains("Invalid capture group name 'domain@name'"));
290+
}
291+
292+
@Test
293+
public void testGetNamedGroupCandidatesWithValidCamelCase() {
294+
// Test that valid camelCase group names work correctly
295+
String validPattern = "(?<userName>\\w+)@(?<domainName>\\w+)";
296+
List<String> groups = RegexCommonUtils.getNamedGroupCandidates(validPattern);
297+
assertEquals(2, groups.size());
298+
assertEquals("userName", groups.get(0));
299+
assertEquals("domainName", groups.get(1));
300+
}
301+
302+
@Test
303+
public void testGetNamedGroupCandidatesWithMixedInvalidValid() {
304+
// Test that even one invalid group name fails the entire validation
305+
String patternWithMixed =
306+
"(?<validName>[a-z]+)\\s+(?<invalid_name>[0-9]+)\\s+(?<anotherValidName>.*)";
307+
IllegalArgumentException exception =
308+
assertThrows(
309+
IllegalArgumentException.class,
310+
() -> RegexCommonUtils.getNamedGroupCandidates(patternWithMixed));
311+
assertTrue(exception.getMessage().contains("Invalid capture group name 'invalid_name'"));
312+
}
214313
}

docs/user/ppl/cmd/parse.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,3 +114,7 @@ There are a few limitations with parse command:
114114
For example, the following query will not display the parsed fields ``host`` unless the source field ``email`` is also explicitly included::
115115

116116
source=accounts | parse email '.+@(?<host>.+)' | fields email, host ;
117+
118+
- Named capture group must start with a letter and contain only letters and digits.
119+
120+
For detailed Java regex pattern syntax and usage, refer to the `official Java Pattern documentation <https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html>`_

docs/user/ppl/cmd/rex.rst

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ Demonstrates naming restrictions for capture groups. Group names cannot contain
166166
Invalid PPL query with underscores::
167167

168168
os> source=accounts | rex field=email "(?<user_name>[^@]+)@(?<email_domain>[^.]+)" | fields email, user_name, email_domain ;
169-
{'reason': 'Invalid Query', 'details': 'Rex pattern must contain at least one named capture group', 'type': 'IllegalArgumentException'}
169+
{'reason': 'Invalid Query', 'details': "Invalid capture group name 'user_name'. Java regex group names must start with a letter and contain only letters and digits.", 'type': 'IllegalArgumentException'}
170170
Error: Query returned no data
171171

172172
Correct PPL query without underscores::
@@ -206,17 +206,17 @@ PPL query exceeding the configured limit results in an error::
206206
Comparison with Related Commands
207207
================================
208208

209-
============================= ============ ============
210-
Feature rex parse
211-
============================= ============ ============
212-
Pattern Type Java Regex Java Regex
213-
Named Groups Required Yes Yes
214-
Multiple Named Groups Yes No
215-
Multiple Matches Yes No
216-
Text Substitution Yes No
217-
Offset Tracking Yes No
218-
Underscores in Group Names No No
219-
============================= ============ ============
209+
================================== ============ ============
210+
Feature rex parse
211+
================================== ============ ============
212+
Pattern Type Java Regex Java Regex
213+
Named Groups Required Yes Yes
214+
Multiple Named Groups Yes No
215+
Multiple Matches Yes No
216+
Text Substitution Yes No
217+
Offset Tracking Yes No
218+
Special Characters in Group Names No No
219+
================================== ============ ============
220220

221221

222222
Limitations
@@ -226,7 +226,6 @@ There are several important limitations with the rex command:
226226

227227
**Named Capture Group Naming:**
228228

229-
- Named capture groups cannot contain underscores due to Java regex limitations
230229
- Group names must start with a letter and contain only letters and digits
231230
- For detailed Java regex pattern syntax and usage, refer to the `official Java Pattern documentation <https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html>`_
232231

integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteParseCommandIT.java

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55

66
package org.opensearch.sql.calcite.remote;
77

8+
import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_BANK;
9+
10+
import java.io.IOException;
11+
import org.junit.Test;
812
import org.opensearch.sql.ppl.ParseCommandIT;
913

1014
public class CalciteParseCommandIT extends ParseCommandIT {
@@ -13,4 +17,60 @@ public void init() throws Exception {
1317
super.init();
1418
enableCalcite();
1519
}
20+
21+
@Test
22+
public void testParseErrorInvalidGroupNameUnderscore() throws IOException {
23+
try {
24+
executeQuery(
25+
String.format(
26+
"source=%s | parse email '.+@(?<host_name>.+)' | fields email", TEST_INDEX_BANK));
27+
fail("Should have thrown an exception for underscore in named capture group");
28+
} catch (Exception e) {
29+
assertTrue(e.getMessage().contains("Invalid capture group name 'host_name'"));
30+
assertTrue(
31+
e.getMessage().contains("must start with a letter and contain only letters and digits"));
32+
}
33+
}
34+
35+
@Test
36+
public void testParseErrorInvalidGroupNameHyphen() throws IOException {
37+
try {
38+
executeQuery(
39+
String.format(
40+
"source=%s | parse email '.+@(?<host-name>.+)' | fields email", TEST_INDEX_BANK));
41+
fail("Should have thrown an exception for hyphen in named capture group");
42+
} catch (Exception e) {
43+
assertTrue(e.getMessage().contains("Invalid capture group name 'host-name'"));
44+
assertTrue(
45+
e.getMessage().contains("must start with a letter and contain only letters and digits"));
46+
}
47+
}
48+
49+
@Test
50+
public void testParseErrorInvalidGroupNameStartingWithDigit() throws IOException {
51+
try {
52+
executeQuery(
53+
String.format(
54+
"source=%s | parse email '.+@(?<1host>.+)' | fields email", TEST_INDEX_BANK));
55+
fail("Should have thrown an exception for group name starting with digit");
56+
} catch (Exception e) {
57+
assertTrue(e.getMessage().contains("Invalid capture group name '1host'"));
58+
assertTrue(
59+
e.getMessage().contains("must start with a letter and contain only letters and digits"));
60+
}
61+
}
62+
63+
@Test
64+
public void testParseErrorInvalidGroupNameSpecialCharacter() throws IOException {
65+
try {
66+
executeQuery(
67+
String.format(
68+
"source=%s | parse email '.+@(?<host@name>.+)' | fields email", TEST_INDEX_BANK));
69+
fail("Should have thrown an exception for special character in named capture group");
70+
} catch (Exception e) {
71+
assertTrue(e.getMessage().contains("Invalid capture group name 'host@name'"));
72+
assertTrue(
73+
e.getMessage().contains("must start with a letter and contain only letters and digits"));
74+
}
75+
}
1676
}

integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteRexCommandIT.java

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,70 @@ public void testRexErrorNoNamedGroups() throws IOException {
5050
}
5151
}
5252

53+
@Test
54+
public void testRexErrorInvalidGroupNameUnderscore() throws IOException {
55+
try {
56+
executeQuery(
57+
String.format(
58+
"source=%s | rex field=email \\\"(?<user_name>[^@]+)@(?<domain>.+)\\\" | fields"
59+
+ " email",
60+
TEST_INDEX_ACCOUNT));
61+
fail("Should have thrown an exception for underscore in named capture group");
62+
} catch (Exception e) {
63+
assertTrue(e.getMessage().contains("Invalid capture group name 'user_name'"));
64+
assertTrue(
65+
e.getMessage().contains("must start with a letter and contain only letters and digits"));
66+
}
67+
}
68+
69+
@Test
70+
public void testRexErrorInvalidGroupNameHyphen() throws IOException {
71+
try {
72+
executeQuery(
73+
String.format(
74+
"source=%s | rex field=email \\\"(?<user-name>[^@]+)@(?<domain>.+)\\\" | fields"
75+
+ " email",
76+
TEST_INDEX_ACCOUNT));
77+
fail("Should have thrown an exception for hyphen in named capture group");
78+
} catch (Exception e) {
79+
assertTrue(e.getMessage().contains("Invalid capture group name 'user-name'"));
80+
assertTrue(
81+
e.getMessage().contains("must start with a letter and contain only letters and digits"));
82+
}
83+
}
84+
85+
@Test
86+
public void testRexErrorInvalidGroupNameStartingWithDigit() throws IOException {
87+
try {
88+
executeQuery(
89+
String.format(
90+
"source=%s | rex field=email \\\"(?<1user>[^@]+)@(?<domain>.+)\\\" | fields"
91+
+ " email",
92+
TEST_INDEX_ACCOUNT));
93+
fail("Should have thrown an exception for group name starting with digit");
94+
} catch (Exception e) {
95+
assertTrue(e.getMessage().contains("Invalid capture group name '1user'"));
96+
assertTrue(
97+
e.getMessage().contains("must start with a letter and contain only letters and digits"));
98+
}
99+
}
100+
101+
@Test
102+
public void testRexErrorInvalidGroupNameSpecialCharacter() throws IOException {
103+
try {
104+
executeQuery(
105+
String.format(
106+
"source=%s | rex field=email \\\"(?<user@name>[^@]+)@(?<domain>.+)\\\" | fields"
107+
+ " email",
108+
TEST_INDEX_ACCOUNT));
109+
fail("Should have thrown an exception for special character in named capture group");
110+
} catch (Exception e) {
111+
assertTrue(e.getMessage().contains("Invalid capture group name 'user@name'"));
112+
assertTrue(
113+
e.getMessage().contains("must start with a letter and contain only letters and digits"));
114+
}
115+
}
116+
53117
@Test
54118
public void testRexWithFiltering() throws IOException {
55119
JSONObject result =

0 commit comments

Comments
 (0)