Skip to content

Commit c290aec

Browse files
authored
Merge pull request #2863 from dolthub/daylon/regex-funcs
Added regexp_instr and regexp_substr
2 parents 8637b99 + ff2d313 commit c290aec

File tree

9 files changed

+654
-37
lines changed

9 files changed

+654
-37
lines changed

enginetest/engine_only_test.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -781,6 +781,47 @@ func TestRegex(t *testing.T) {
781781
},
782782
},
783783
},
784+
{
785+
Name: "REGEXP caching behavior",
786+
SetUpScript: []string{
787+
"CREATE TABLE test (v1 TEXT, v2 INT, v3 INT);",
788+
"INSERT INTO test VALUES ('abc', 1, 2), ('[d-i]+', 2, 3), ('ghi', 3, 4);",
789+
},
790+
Assertions: []queries.ScriptTestAssertion{
791+
{
792+
Query: "SELECT REGEXP_LIKE('abc def ghi', 'abc') FROM test;",
793+
Expected: []sql.Row{{1}, {1}, {1}},
794+
},
795+
{
796+
Query: "SELECT REGEXP_LIKE('abc def ghi', v1) FROM test;",
797+
Expected: []sql.Row{{1}, {1}, {1}},
798+
},
799+
{
800+
Query: "SELECT REGEXP_INSTR('abc def ghi', '[a-z]+', 1, 2) FROM test;",
801+
Expected: []sql.Row{{5}, {5}, {5}},
802+
},
803+
{
804+
Query: "SELECT REGEXP_INSTR('abc def ghi', v1, 1, 1) FROM test;",
805+
Expected: []sql.Row{{1}, {5}, {9}},
806+
},
807+
{
808+
Query: "SELECT REGEXP_INSTR('abc def ghi', '[a-z]+', v2, v3) FROM test;",
809+
Expected: []sql.Row{{5}, {9}, {0}},
810+
},
811+
{
812+
Query: "SELECT REGEXP_SUBSTR('abc def ghi', '[a-z]+', 1, 2) FROM test;",
813+
Expected: []sql.Row{{"def"}, {"def"}, {"def"}},
814+
},
815+
{
816+
Query: "SELECT REGEXP_SUBSTR('abc def ghi', v1, 1, 1) FROM test;",
817+
Expected: []sql.Row{{"abc"}, {"def"}, {"ghi"}},
818+
},
819+
{
820+
Query: "SELECT REGEXP_SUBSTR('abc def ghi', '[a-z]+', v2, v3) FROM test;",
821+
Expected: []sql.Row{{"def"}, {"ghi"}, {nil}},
822+
},
823+
},
824+
},
784825
} {
785826
enginetest.TestScript(t, harness, test)
786827
}

enginetest/queries/regex_queries.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2107,4 +2107,52 @@ var RegexTests = []RegexTest{
21072107
Query: `SELECT REGEXP_LIKE("abc", "^([ab]*?)(?<!(a))c");`,
21082108
Expected: []sql.Row{{1}},
21092109
},
2110+
{
2111+
Query: "SELECT REGEXP_INSTR('dog cat dog', 'dog');",
2112+
Expected: []sql.Row{{1}},
2113+
},
2114+
{
2115+
Query: "SELECT REGEXP_INSTR('dog cat dog', 'doggo');",
2116+
Expected: []sql.Row{{0}},
2117+
},
2118+
{
2119+
Query: "SELECT REGEXP_INSTR('dog cat dog', 'dog', 2);",
2120+
Expected: []sql.Row{{9}},
2121+
},
2122+
{
2123+
Query: "SELECT REGEXP_INSTR('dog cat dog', 'dog', 1, 2);",
2124+
Expected: []sql.Row{{9}},
2125+
},
2126+
{
2127+
Query: "SELECT REGEXP_INSTR('aa aaa aaaa', 'a{2}');",
2128+
Expected: []sql.Row{{1}},
2129+
},
2130+
{
2131+
Query: "SELECT REGEXP_INSTR('aa aaa aaaa', 'a{4}');",
2132+
Expected: []sql.Row{{8}},
2133+
},
2134+
{
2135+
Query: "SELECT REGEXP_INSTR('dog cat dog', 'dog', 1, -1, 0);",
2136+
Expected: []sql.Row{{1}},
2137+
},
2138+
{
2139+
Query: "SELECT REGEXP_INSTR('dog cat dog', 'dog', 1, 1, 1);",
2140+
Expected: []sql.Row{{4}},
2141+
},
2142+
{
2143+
Query: "SELECT REGEXP_SUBSTR('abc def ghi', '[a-z]+');",
2144+
Expected: []sql.Row{{"abc"}},
2145+
},
2146+
{
2147+
Query: "SELECT REGEXP_SUBSTR('abc def ghi', '[a-z]+', 1, 3);",
2148+
Expected: []sql.Row{{"ghi"}},
2149+
},
2150+
{
2151+
Query: "SELECT REGEXP_SUBSTR('abc def ghi', '[a-z]+', 2, 2);",
2152+
Expected: []sql.Row{{"def"}},
2153+
},
2154+
{
2155+
Query: "SELECT REGEXP_SUBSTR('abc def ghi', '[j-z]+');",
2156+
Expected: []sql.Row{{nil}},
2157+
},
21102158
}

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ module github.com/dolthub/go-mysql-server
33
require (
44
github.com/cespare/xxhash/v2 v2.2.0
55
github.com/dolthub/flatbuffers/v23 v23.3.3-dh.2
6-
github.com/dolthub/go-icu-regex v0.0.0-20241215010122-db690dd53c90
6+
github.com/dolthub/go-icu-regex v0.0.0-20250303123116-549b8d7cad00
77
github.com/dolthub/jsonpath v0.0.2-0.20240227200619-19675ab05c71
88
github.com/dolthub/sqllogictest/go v0.0.0-20201107003712-816f3ae12d81
99
github.com/dolthub/vitess v0.0.0-20250228011932-c4f6bba87730

go.sum

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,16 +52,12 @@ github.com/denisenkom/go-mssqldb v0.10.0/go.mod h1:xbL0rPBG9cCiLr28tMa8zpbdarY27
5252
github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ=
5353
github.com/dolthub/flatbuffers/v23 v23.3.3-dh.2 h1:u3PMzfF8RkKd3lB9pZ2bfn0qEG+1Gms9599cr0REMww=
5454
github.com/dolthub/flatbuffers/v23 v23.3.3-dh.2/go.mod h1:mIEZOHnFx4ZMQeawhw9rhsj+0zwQj7adVsnBX7t+eKY=
55-
github.com/dolthub/go-icu-regex v0.0.0-20241215010122-db690dd53c90 h1:Sni8jrP0sy/w9ZYXoff4g/ixe+7bFCZlfCqXKJSU+zM=
56-
github.com/dolthub/go-icu-regex v0.0.0-20241215010122-db690dd53c90/go.mod h1:ylU4XjUpsMcvl/BKeRRMXSH7e7WBrPXdSLvnRJYrxEA=
55+
github.com/dolthub/go-icu-regex v0.0.0-20250303123116-549b8d7cad00 h1:rh2ij2yTYKJWlX+c8XRg4H5OzqPewbU1lPK8pcfVmx8=
56+
github.com/dolthub/go-icu-regex v0.0.0-20250303123116-549b8d7cad00/go.mod h1:ylU4XjUpsMcvl/BKeRRMXSH7e7WBrPXdSLvnRJYrxEA=
5757
github.com/dolthub/jsonpath v0.0.2-0.20240227200619-19675ab05c71 h1:bMGS25NWAGTEtT5tOBsCuCrlYnLRKpbJVJkDbrTRhwQ=
5858
github.com/dolthub/jsonpath v0.0.2-0.20240227200619-19675ab05c71/go.mod h1:2/2zjLQ/JOOSbbSboojeg+cAwcRV0fDLzIiWch/lhqI=
5959
github.com/dolthub/sqllogictest/go v0.0.0-20201107003712-816f3ae12d81 h1:7/v8q9XGFa6q5Ap4Z/OhNkAMBaK5YeuEzwJt+NZdhiE=
6060
github.com/dolthub/sqllogictest/go v0.0.0-20201107003712-816f3ae12d81/go.mod h1:siLfyv2c92W1eN/R4QqG/+RjjX5W2+gCTRjZxBjI3TY=
61-
github.com/dolthub/vitess v0.0.0-20250123002143-3b45b8cacbfa h1:kyoPzxViSXAyqfO0Mab7Qo1UogFIrxZKKyBU6kBOl+E=
62-
github.com/dolthub/vitess v0.0.0-20250123002143-3b45b8cacbfa/go.mod h1:1gQZs/byeHLMSul3Lvl3MzioMtOW1je79QYGyi2fd70=
63-
github.com/dolthub/vitess v0.0.0-20250214225328-a0ed4612b41c h1:YsZuBsU5wKmwrXGfzhW6/a+XzP/LWfzayXC3nCz/kqQ=
64-
github.com/dolthub/vitess v0.0.0-20250214225328-a0ed4612b41c/go.mod h1:1gQZs/byeHLMSul3Lvl3MzioMtOW1je79QYGyi2fd70=
6561
github.com/dolthub/vitess v0.0.0-20250228011932-c4f6bba87730 h1:GtlMVB7+Z7fZZj7BHRFd2rzxZ574dJ8cB/EHWdq1kbY=
6662
github.com/dolthub/vitess v0.0.0-20250228011932-c4f6bba87730/go.mod h1:1gQZs/byeHLMSul3Lvl3MzioMtOW1je79QYGyi2fd70=
6763
github.com/dustin/go-humanize v0.0.0-20171111073723-bb3d318650d4/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
Lines changed: 266 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,266 @@
1+
// Copyright 2025 Dolthub, Inc.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package function
16+
17+
import (
18+
"fmt"
19+
"strings"
20+
"sync"
21+
22+
regex "github.com/dolthub/go-icu-regex"
23+
24+
"github.com/dolthub/go-mysql-server/sql"
25+
"github.com/dolthub/go-mysql-server/sql/expression"
26+
"github.com/dolthub/go-mysql-server/sql/types"
27+
)
28+
29+
// RegexpInstr implements the REGEXP_INSTR function.
30+
// https://dev.mysql.com/doc/refman/8.0/en/regexp.html#function_regexp-instr
31+
type RegexpInstr struct {
32+
Text sql.Expression
33+
Pattern sql.Expression
34+
Position sql.Expression
35+
Occurrence sql.Expression
36+
ReturnOption sql.Expression
37+
Flags sql.Expression
38+
39+
cachedVal any
40+
cacheRegex bool
41+
cacheVal bool
42+
re regex.Regex
43+
compileOnce sync.Once
44+
compileErr error
45+
}
46+
47+
var _ sql.FunctionExpression = (*RegexpInstr)(nil)
48+
var _ sql.CollationCoercible = (*RegexpInstr)(nil)
49+
var _ sql.Disposable = (*RegexpInstr)(nil)
50+
51+
// NewRegexpInstr creates a new RegexpInstr expression.
52+
func NewRegexpInstr(args ...sql.Expression) (sql.Expression, error) {
53+
var r *RegexpInstr
54+
switch len(args) {
55+
case 6:
56+
r = &RegexpInstr{
57+
Text: args[0],
58+
Pattern: args[1],
59+
Position: args[2],
60+
Occurrence: args[3],
61+
ReturnOption: args[4],
62+
Flags: args[5],
63+
}
64+
case 5:
65+
r = &RegexpInstr{
66+
Text: args[0],
67+
Pattern: args[1],
68+
Position: args[2],
69+
Occurrence: args[3],
70+
ReturnOption: args[4],
71+
}
72+
case 4:
73+
r = &RegexpInstr{
74+
Text: args[0],
75+
Pattern: args[1],
76+
Position: args[2],
77+
Occurrence: args[3],
78+
ReturnOption: expression.NewLiteral(0, types.Int32),
79+
}
80+
case 3:
81+
r = &RegexpInstr{
82+
Text: args[0],
83+
Pattern: args[1],
84+
Position: args[2],
85+
Occurrence: expression.NewLiteral(1, types.Int32),
86+
ReturnOption: expression.NewLiteral(0, types.Int32),
87+
}
88+
case 2:
89+
r = &RegexpInstr{
90+
Text: args[0],
91+
Pattern: args[1],
92+
Position: expression.NewLiteral(1, types.Int32),
93+
Occurrence: expression.NewLiteral(1, types.Int32),
94+
ReturnOption: expression.NewLiteral(0, types.Int32),
95+
}
96+
default:
97+
return nil, sql.ErrInvalidArgumentNumber.New("regexp_instr", "2 to 6", len(args))
98+
}
99+
return r, nil
100+
}
101+
102+
// FunctionName implements sql.FunctionExpression
103+
func (r *RegexpInstr) FunctionName() string {
104+
return "regexp_instr"
105+
}
106+
107+
// Description implements sql.FunctionExpression
108+
func (r *RegexpInstr) Description() string {
109+
return "returns the starting index of the substring."
110+
}
111+
112+
// Type implements the sql.Expression interface.
113+
func (r *RegexpInstr) Type() sql.Type { return types.Int32 }
114+
115+
// CollationCoercibility implements the interface sql.CollationCoercible.
116+
func (r *RegexpInstr) CollationCoercibility(ctx *sql.Context) (collation sql.CollationID, coercibility byte) {
117+
leftCollation, leftCoercibility := sql.GetCoercibility(ctx, r.Text)
118+
rightCollation, rightCoercibility := sql.GetCoercibility(ctx, r.Pattern)
119+
return sql.ResolveCoercibility(leftCollation, leftCoercibility, rightCollation, rightCoercibility)
120+
}
121+
122+
// IsNullable implements the sql.Expression interface.
123+
func (r *RegexpInstr) IsNullable() bool { return true }
124+
125+
// Children implements the sql.Expression interface.
126+
func (r *RegexpInstr) Children() []sql.Expression {
127+
var result = []sql.Expression{r.Text, r.Pattern, r.Position, r.Occurrence, r.ReturnOption}
128+
if r.Flags != nil {
129+
result = append(result, r.Flags)
130+
}
131+
return result
132+
}
133+
134+
// Resolved implements the sql.Expression interface.
135+
func (r *RegexpInstr) Resolved() bool {
136+
return r.Text.Resolved() && r.Pattern.Resolved() && r.Position.Resolved() && r.Occurrence.Resolved() &&
137+
r.ReturnOption.Resolved() && (r.Flags == nil || r.Flags.Resolved())
138+
}
139+
140+
// WithChildren implements the sql.Expression interface.
141+
func (r *RegexpInstr) WithChildren(children ...sql.Expression) (sql.Expression, error) {
142+
required := 5
143+
if r.Flags != nil {
144+
required = 6
145+
}
146+
if len(children) != required {
147+
return nil, sql.ErrInvalidChildrenNumber.New(r, len(children), required)
148+
}
149+
return NewRegexpInstr(children...)
150+
}
151+
152+
// String implements the sql.Expression interface.
153+
func (r *RegexpInstr) String() string {
154+
var args []string
155+
for _, e := range r.Children() {
156+
args = append(args, e.String())
157+
}
158+
return fmt.Sprintf("%s(%s)", r.FunctionName(), strings.Join(args, ","))
159+
}
160+
161+
// compile handles compilation of the regex.
162+
func (r *RegexpInstr) compile(ctx *sql.Context, row sql.Row) {
163+
r.compileOnce.Do(func() {
164+
r.cacheRegex = canBeCached(r.Text, r.Pattern, r.Flags)
165+
r.cacheVal = canBeCached(r.Text, r.Pattern, r.Position, r.Occurrence, r.ReturnOption, r.Flags)
166+
if r.cacheRegex {
167+
r.re, r.compileErr = compileRegex(ctx, r.Pattern, r.Text, r.Flags, r.FunctionName(), row)
168+
}
169+
})
170+
if !r.cacheRegex {
171+
if r.re != nil {
172+
if r.compileErr = r.re.Close(); r.compileErr != nil {
173+
return
174+
}
175+
}
176+
r.re, r.compileErr = compileRegex(ctx, r.Pattern, r.Text, r.Flags, r.FunctionName(), row)
177+
}
178+
}
179+
180+
// Eval implements the sql.Expression interface.
181+
func (r *RegexpInstr) Eval(ctx *sql.Context, row sql.Row) (interface{}, error) {
182+
span, ctx := ctx.Span("function.RegexpInstr")
183+
defer span.End()
184+
185+
if r.cachedVal != nil {
186+
return r.cachedVal, nil
187+
}
188+
189+
r.compile(ctx, row)
190+
if r.compileErr != nil {
191+
return nil, r.compileErr
192+
}
193+
if r.re == nil {
194+
return nil, nil
195+
}
196+
197+
text, err := r.Text.Eval(ctx, row)
198+
if err != nil {
199+
return nil, err
200+
}
201+
if text == nil {
202+
return nil, nil
203+
}
204+
text, _, err = types.LongText.Convert(text)
205+
if err != nil {
206+
return nil, err
207+
}
208+
209+
pos, err := r.Position.Eval(ctx, row)
210+
if err != nil {
211+
return nil, err
212+
}
213+
if pos == nil {
214+
return nil, nil
215+
}
216+
pos, _, err = types.Int32.Convert(pos)
217+
if err != nil {
218+
return nil, err
219+
}
220+
221+
occurrence, err := r.Occurrence.Eval(ctx, row)
222+
if err != nil {
223+
return nil, err
224+
}
225+
if occurrence == nil {
226+
return nil, nil
227+
}
228+
occurrence, _, err = types.Int32.Convert(occurrence)
229+
if err != nil {
230+
return nil, err
231+
}
232+
233+
returnOption, err := r.ReturnOption.Eval(ctx, row)
234+
if err != nil {
235+
return nil, err
236+
}
237+
if returnOption == nil {
238+
return nil, nil
239+
}
240+
returnOption, _, err = types.Int32.Convert(returnOption)
241+
if err != nil {
242+
return nil, err
243+
}
244+
245+
err = r.re.SetMatchString(ctx, text.(string))
246+
if err != nil {
247+
return nil, err
248+
}
249+
index, err := r.re.IndexOf(ctx, int(pos.(int32)), int(occurrence.(int32)), returnOption.(int32) == 1)
250+
if err != nil {
251+
return nil, err
252+
}
253+
254+
outVal := int32(index)
255+
if r.cacheVal {
256+
r.cachedVal = outVal
257+
}
258+
return outVal, nil
259+
}
260+
261+
// Dispose implements the sql.Disposable interface.
262+
func (r *RegexpInstr) Dispose() {
263+
if r.re != nil {
264+
_ = r.re.Close()
265+
}
266+
}

0 commit comments

Comments
 (0)