Skip to content

Commit 54db8e7

Browse files
fix: Complete RE2 to POSIX regex conversion (fixes #24) (#73)
Changes: - Modified convertRE2ToPOSIX() to return case-insensitive flag - Added validation for unsupported RE2 features (lookahead, lookbehind, named groups, flags) - Convert non-capturing groups (?:...) to regular groups (...) - Updated callMatches() to use ~* operator for case-insensitive patterns - Added comprehensive regex conversion tests (6 test functions, 25+ test cases) - Updated existing test to handle both ~ and ~* operators - Updated CLAUDE.md with detailed regex conversion documentation Breaking Change: Patterns with unsupported RE2 features now return clear errors instead of silently passing through and causing PostgreSQL syntax errors. Benefits: - (?i) flag now correctly generates ~* operator (case-insensitive matching) - (?:...) non-capturing groups converted to POSIX-compatible (...) groups - Clear error messages for unsupported features prevent runtime failures - Better type safety and debugging 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: Claude <noreply@anthropic.com>
1 parent 708e6f6 commit 54db8e7

File tree

4 files changed

+402
-33
lines changed

4 files changed

+402
-33
lines changed

CLAUDE.md

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -127,12 +127,26 @@ Full support for CEL comprehensions converted to PostgreSQL UNNEST patterns:
127127

128128
Pattern recognition and conversion logic is in `comprehensions.go`.
129129

130-
### Regex Pattern Matching (v2.8.0)
130+
### Regex Pattern Matching (v2.8.0+)
131131

132132
Supports CEL `matches()` function with automatic RE2 to POSIX regex conversion:
133133
- `field.matches(r"pattern")` → `field ~ 'pattern'`
134-
- `field.matches(r"(?i)pattern")` → `field ~* 'pattern'` (case-insensitive)
135-
- Automatic conversion of RE2 patterns to PostgreSQL-compatible POSIX format
134+
- `field.matches(r"(?i)pattern")` → `field ~* 'pattern'` (case-insensitive, `(?i)` stripped from pattern)
135+
- `field.matches(r"(?:abc)")` → `field ~ '(abc)'` (non-capturing groups converted to regular groups)
136+
137+
**Automatic Conversions:**
138+
- Case-insensitive flag `(?i)` → Uses `~*` operator, flag stripped from pattern
139+
- Non-capturing groups `(?:...)` → Converted to regular groups `(...)`
140+
- Character classes: `\d` → `[[:digit:]]`, `\w` → `[[:alnum:]_]`, `\s` → `[[:space:]]`
141+
- Word boundaries: `\b` → `\y`
142+
143+
**Unsupported RE2 Features (will return errors):**
144+
- Lookahead assertions: `(?=...)`, `(?!...)`
145+
- Lookbehind assertions: `(?<=...)`, `(?<!...)`
146+
- Named capture groups: `(?P<name>...)`
147+
- Inline flags other than `(?i)`: `(?m)`, `(?s)`, `(?-i)`, etc.
148+
149+
These validations prevent PostgreSQL syntax errors and ensure predictable behavior.
136150

137151
## Code Quality Requirements
138152

cel2sql.go

Lines changed: 60 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -839,8 +839,6 @@ func (con *converter) callMatches(target *exprpb.Expr, args []*exprpb.Expr) erro
839839
return err
840840
}
841841

842-
con.str.WriteString(" ~ ")
843-
844842
// Visit the pattern expression and convert from RE2 to POSIX if it's a string literal
845843
if constExpr := patternExpr.GetConstExpr(); constExpr != nil && constExpr.GetStringValue() != "" {
846844
// Convert RE2 pattern to POSIX
@@ -851,29 +849,34 @@ func (con *converter) callMatches(target *exprpb.Expr, args []*exprpb.Expr) erro
851849
}
852850

853851
// Convert RE2 to POSIX with security validation
854-
posixPattern, err := convertRE2ToPOSIX(re2Pattern)
852+
posixPattern, caseInsensitive, err := convertRE2ToPOSIX(re2Pattern)
855853
if err != nil {
856854
return fmt.Errorf("invalid regex pattern: %w", err)
857855
}
858856

859-
// Determine case sensitivity
860-
caseInsensitive := strings.HasPrefix(re2Pattern, "(?i)")
861-
862857
con.logger.LogAttrs(context.Background(), slog.LevelDebug,
863858
"regex pattern conversion",
864859
slog.String("original_pattern", re2Pattern),
865860
slog.String("converted_pattern", posixPattern),
866861
slog.Bool("case_insensitive", caseInsensitive),
867862
)
868863

864+
// Use ~* for case-insensitive matching, ~ for case-sensitive
865+
if caseInsensitive {
866+
con.str.WriteString(" ~* ")
867+
} else {
868+
con.str.WriteString(" ~ ")
869+
}
870+
869871
// Write the converted pattern as a string literal
870872
escaped := strings.ReplaceAll(posixPattern, "'", "''")
871873
con.str.WriteString("'")
872874
con.str.WriteString(escaped)
873875
con.str.WriteString("'")
874876
} else {
875877
// For non-literal patterns, we can't convert at compile time
876-
// Just use the pattern as-is and hope it's POSIX compatible
878+
// Just use the pattern as-is with case-sensitive operator
879+
con.str.WriteString(" ~ ")
877880
if err := con.visit(patternExpr); err != nil {
878881
return err
879882
}
@@ -1947,19 +1950,45 @@ func isBinaryOrTernaryOperator(expr *exprpb.Expr) bool {
19471950

19481951
// convertRE2ToPOSIX converts an RE2 regex pattern to POSIX ERE format for PostgreSQL.
19491952
// It performs security validation to prevent ReDoS attacks (CWE-1333).
1953+
// Returns: (posixPattern, caseInsensitive, error)
19501954
// Note: This is a basic conversion for common patterns. Full RE2 to POSIX conversion is complex.
1951-
func convertRE2ToPOSIX(re2Pattern string) (string, error) {
1955+
func convertRE2ToPOSIX(re2Pattern string) (string, bool, error) {
19521956
// 1. Check pattern length to prevent processing extremely long patterns
19531957
if len(re2Pattern) > maxRegexPatternLength {
1954-
return "", fmt.Errorf("regex pattern exceeds maximum length of %d characters", maxRegexPatternLength)
1958+
return "", false, fmt.Errorf("regex pattern exceeds maximum length of %d characters", maxRegexPatternLength)
1959+
}
1960+
1961+
// 2. Extract case-insensitive flag if present
1962+
caseInsensitive := false
1963+
if strings.HasPrefix(re2Pattern, "(?i)") {
1964+
caseInsensitive = true
1965+
re2Pattern = strings.TrimPrefix(re2Pattern, "(?i)")
19551966
}
19561967

1957-
// 2. Detect catastrophic nested quantifiers that cause exponential backtracking
1968+
// 3. Detect unsupported RE2 features and return errors
1969+
// Lookahead assertions
1970+
if strings.Contains(re2Pattern, "(?=") || strings.Contains(re2Pattern, "(?!") {
1971+
return "", false, errors.New("lookahead assertions (?=...), (?!...) are not supported in PostgreSQL POSIX regex")
1972+
}
1973+
// Lookbehind assertions
1974+
if strings.Contains(re2Pattern, "(?<=") || strings.Contains(re2Pattern, "(?<!") {
1975+
return "", false, errors.New("lookbehind assertions (?<=...), (?<!...) are not supported in PostgreSQL POSIX regex")
1976+
}
1977+
// Named capture groups
1978+
if strings.Contains(re2Pattern, "(?P<") {
1979+
return "", false, errors.New("named capture groups (?P<name>...) are not supported in PostgreSQL POSIX regex")
1980+
}
1981+
// Other inline flags (after we've already handled (?i))
1982+
if strings.Contains(re2Pattern, "(?m") || strings.Contains(re2Pattern, "(?s") || strings.Contains(re2Pattern, "(?-") {
1983+
return "", false, errors.New("inline flags other than (?i) are not supported in PostgreSQL POSIX regex")
1984+
}
1985+
1986+
// 4. Detect catastrophic nested quantifiers that cause exponential backtracking
19581987
// Patterns like (a+)+, (a*)*, (x+x+)+, ((a)+b)+, etc. are extremely dangerous
19591988

19601989
// Check for doubled quantifiers
19611990
if matched, _ := regexp.MatchString(`[*+][*+]`, re2Pattern); matched {
1962-
return "", errors.New("regex contains catastrophic nested quantifiers that could cause ReDoS")
1991+
return "", false, errors.New("regex contains catastrophic nested quantifiers that could cause ReDoS")
19631992
}
19641993

19651994
// Check for groups that contain quantifiers and are themselves quantified
@@ -1990,7 +2019,7 @@ func convertRE2ToPOSIX(re2Pattern string) (string, error) {
19902019
if nextChar == '*' || nextChar == '+' || nextChar == '?' || nextChar == '{' {
19912020
// This group is quantified. Check if it contains quantifiers
19922021
if len(groupHasQuantifier) > 0 && groupHasQuantifier[len(groupHasQuantifier)-1] {
1993-
return "", errors.New("regex contains catastrophic nested quantifiers that could cause ReDoS")
2022+
return "", false, errors.New("regex contains catastrophic nested quantifiers that could cause ReDoS")
19942023
}
19952024
}
19962025
}
@@ -2018,21 +2047,21 @@ func convertRE2ToPOSIX(re2Pattern string) (string, error) {
20182047
}
20192048
}
20202049

2021-
// 3. Count and limit capture groups to prevent memory exhaustion
2050+
// 5. Count and limit capture groups to prevent memory exhaustion
20222051
groupCount := strings.Count(re2Pattern, "(") - strings.Count(re2Pattern, `\(`)
20232052
if groupCount > maxRegexGroups {
2024-
return "", fmt.Errorf("regex contains %d capture groups, exceeds maximum of %d", groupCount, maxRegexGroups)
2053+
return "", false, fmt.Errorf("regex contains %d capture groups, exceeds maximum of %d", groupCount, maxRegexGroups)
20252054
}
20262055

2027-
// 4. Detect exponential alternation patterns like (a|a)*b or (a|ab)*
2056+
// 6. Detect exponential alternation patterns like (a|a)*b or (a|ab)*
20282057
alternationPattern := regexp.MustCompile(`\([^)]*\|[^)]*\)[*+]`)
20292058
if alternationPattern.MatchString(re2Pattern) {
20302059
// Check if alternation has overlapping branches (more dangerous)
20312060
// This is a simple heuristic - full analysis would be more complex
2032-
return "", errors.New("regex contains quantified alternation that could cause ReDoS")
2061+
return "", false, errors.New("regex contains quantified alternation that could cause ReDoS")
20332062
}
20342063

2035-
// 5. Check nesting depth to prevent deeply nested patterns
2064+
// 7. Check nesting depth to prevent deeply nested patterns
20362065
maxDepth := 0
20372066
currentDepth := 0
20382067
for _, char := range re2Pattern {
@@ -2046,7 +2075,7 @@ func convertRE2ToPOSIX(re2Pattern string) (string, error) {
20462075
}
20472076
}
20482077
if maxDepth > maxRegexNestingDepth {
2049-
return "", fmt.Errorf("regex nesting depth %d exceeds maximum of %d", maxDepth, maxRegexNestingDepth)
2078+
return "", false, fmt.Errorf("regex nesting depth %d exceeds maximum of %d", maxDepth, maxRegexNestingDepth)
20502079
}
20512080

20522081
// Passed all security checks - proceed with conversion
@@ -2080,16 +2109,19 @@ func convertRE2ToPOSIX(re2Pattern string) (string, error) {
20802109
// 8. Non-whitespace shortcuts: \S -> [^[:space:]]
20812110
posixPattern = strings.ReplaceAll(posixPattern, `\S`, `[^[:space:]]`)
20822111

2083-
// Note: Many RE2 features are not directly convertible to POSIX ERE:
2084-
// - Lookahead/lookbehind assertions (?=...), (?!...), (?<=...), (?<!...)
2085-
// - Non-capturing groups (?:...)
2086-
// - Named groups (?P<name>...)
2087-
// - Case-insensitive flags (?i)
2088-
// - Multiline flags (?m)
2089-
// - Unicode character classes
2112+
// 9. Non-capturing groups: (?:...) -> (...)
2113+
// POSIX ERE doesn't have non-capturing groups, so convert to regular groups
2114+
posixPattern = strings.ReplaceAll(posixPattern, `(?:`, `(`)
2115+
2116+
// Note: Unsupported RE2 features that are now validated and return errors:
2117+
// - Lookahead/lookbehind assertions (?=...), (?!...), (?<=...), (?<!...) - ERROR
2118+
// - Named groups (?P<name>...) - ERROR
2119+
// - Case-insensitive flag (?i) - CONVERTED (returned as separate boolean)
2120+
// - Other inline flags (?m), (?s) - ERROR
20902121
//
2091-
// For these cases, the pattern is returned as-is, which may cause PostgreSQL errors
2092-
// if the pattern uses unsupported RE2 features.
2122+
// Converted features:
2123+
// - Non-capturing groups (?:...) - Converted to regular groups (...)
2124+
// - Character class shortcuts (\d, \w, \s, etc.) - Converted to POSIX equivalents
20932125

2094-
return posixPattern, nil
2126+
return posixPattern, caseInsensitive, nil
20952127
}

parameterized_test.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package cel2sql_test
22

33
import (
4+
"strings"
45
"testing"
56

67
"github.com/google/cel-go/cel"
@@ -434,9 +435,10 @@ func TestConvertParameterized_RegexPatterns(t *testing.T) {
434435
// Assert parameter count
435436
assert.Len(t, result.Parameters, tt.wantParamCount, "Parameter count should match")
436437

437-
// Assert SQL contains regex operator
438+
// Assert SQL contains regex operator (~ or ~* for case-insensitive)
438439
if tt.containsRegex {
439-
assert.Contains(t, result.SQL, " ~ ", "SQL should contain regex operator")
440+
hasRegexOperator := strings.Contains(result.SQL, " ~ ") || strings.Contains(result.SQL, " ~* ")
441+
assert.True(t, hasRegexOperator, "SQL should contain regex operator (~ or ~*)")
440442
}
441443
})
442444
}

0 commit comments

Comments
 (0)