Skip to content

Commit 7a538b3

Browse files
charlespnh, Azanul, and rafiss
committed
builtins: add levenshtein_less_equal function
This adds the levenshtein_less_equal function with two overloads: one with default costs and one with custom costs. Release note (sql change): Implemented levenshtein_less_equal(string, string, int) and levenshtein_less_equal(string, string, int, int, int, int) built-in functions. Co-authored-by: Charles Nguyen <[email protected]> Co-authored-by: Azanul <[email protected]> Co-authored-by: Rafi Shamim <[email protected]>
1 parent 6228e69 commit 7a538b3

File tree

7 files changed

+599
-8
lines changed

7 files changed

+599
-8
lines changed

docs/generated/sql/functions.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1068,6 +1068,10 @@ available replica will error.</p>
10681068
</span></td><td>Immutable</td></tr>
10691069
<tr><td><a name="levenshtein"></a><code>levenshtein(source: <a href="string.html">string</a>, target: <a href="string.html">string</a>, ins_cost: <a href="int.html">int</a>, del_cost: <a href="int.html">int</a>, sub_cost: <a href="int.html">int</a>) &rarr; <a href="int.html">int</a></code></td><td><span class="funcdesc"><p>Calculates the Levenshtein distance between two strings. The cost parameters specify how much to charge for each edit operation. Maximum input length is 255 characters.</p>
10701070
</span></td><td>Immutable</td></tr>
1071+
<tr><td><a name="levenshtein_less_equal"></a><code>levenshtein_less_equal(source: <a href="string.html">string</a>, target: <a href="string.html">string</a>, ins_cost: <a href="int.html">int</a>, del_cost: <a href="int.html">int</a>, sub_cost: <a href="int.html">int</a>, max_d: <a href="int.html">int</a>) &rarr; <a href="int.html">int</a></code></td><td><span class="funcdesc"><p>Calculates the Levenshtein distance between two strings. The cost parameters specify how much to charge for each edit operation. If the actual distance is less than or equal to max_d, it returns that distance. Otherwise, this function returns a value greater than max_d. The maximum length of the input strings is 255 characters.</p>
1072+
</span></td><td>Immutable</td></tr>
1073+
<tr><td><a name="levenshtein_less_equal"></a><code>levenshtein_less_equal(source: <a href="string.html">string</a>, target: <a href="string.html">string</a>, max_d: <a href="int.html">int</a>) &rarr; <a href="int.html">int</a></code></td><td><span class="funcdesc"><p>Calculates the Levenshtein distance between two strings. If the actual distance is less than or equal to max_d, it returns that distance. Otherwise, this function returns a value greater than max_d. The maximum length of the input strings is 255 characters.</p>
1074+
</span></td><td>Immutable</td></tr>
10711075
<tr><td><a name="metaphone"></a><code>metaphone(source: <a href="string.html">string</a>, max_output_length: <a href="int.html">int</a>) &rarr; <a href="string.html">string</a></code></td><td><span class="funcdesc"><p>Convert a string to its Metaphone code. Maximum input length is 255 characters</p>
10721076
</span></td><td>Immutable</td></tr>
10731077
<tr><td><a name="soundex"></a><code>soundex(source: <a href="string.html">string</a>) &rarr; <a href="string.html">string</a></code></td><td><span class="funcdesc"><p>Convert a string to its Soundex code.</p>

pkg/sql/logictest/testdata/logic_test/fuzzystrmatch

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,14 @@ INSERT INTO fuzzystrmatch_table VALUES
1212
(3, '😄', '🐯'),
1313
(4, null, 'a'),
1414
(5, 'a', null),
15-
(6, null, null)
15+
(6, null, null),
16+
(7, 'apple', 'apple'),
17+
(8, 'a', 'abcd'),
18+
(9, '', ''),
19+
(10, '', 'abc'),
20+
(11, 'xyz', ''),
21+
(12, 'extensive', 'exhaustive'),
22+
(13, '🌞', 'a')
1623

1724
statement error levenshtein argument exceeds maximum length of 255 characters
1825
SELECT levenshtein(lpad('', 256, 'x'), '')
@@ -29,6 +36,59 @@ apple banana 5 18
2936
NULL a NULL NULL
3037
a NULL NULL NULL
3138
NULL NULL NULL NULL
39+
apple apple 0 0
40+
a abcd 3 6
41+
· · 0 0
42+
· abc 3 6
43+
xyz · 3 9
44+
extensive exhaustive 4 14
45+
🌞 a 1 4
46+
47+
statement error levenshtein_less_equal argument exceeds maximum length of 255 characters
48+
SELECT levenshtein_less_equal(lpad('', 256, 'x'), '', 4)
49+
50+
statement error levenshtein_less_equal argument exceeds maximum length of 255 characters
51+
SELECT levenshtein_less_equal(lpad('', 256, 'x'), '', 4, 2, 3, 4)
52+
53+
statement error levenshtein_less_equal argument exceeds maximum length of 255 characters
54+
SELECT levenshtein_less_equal(repeat('🌞', 256), '', 4)
55+
56+
statement error levenshtein_less_equal argument exceeds maximum length of 255 characters
57+
SELECT levenshtein_less_equal('', repeat('🌞', 256), 4)
58+
59+
query TTII
60+
SELECT a, b, levenshtein_less_equal(a, b, 3), levenshtein_less_equal(a, b, 2, 3, 4, 12) FROM fuzzystrmatch_table ORDER BY id
61+
----
62+
apple banana 4 13
63+
· pear 4 8
64+
😄 🐯 1 4
65+
NULL a NULL NULL
66+
a NULL NULL NULL
67+
NULL NULL NULL NULL
68+
apple apple 0 0
69+
a abcd 3 6
70+
· · 0 0
71+
· abc 3 6
72+
xyz · 3 9
73+
extensive exhaustive 4 13
74+
🌞 a 1 4
75+
76+
# Test cases from PostgreSQL
77+
query I
78+
SELECT levenshtein_less_equal('extensive', 'exhaustive', 2)
79+
----
80+
3
81+
82+
query I
83+
SELECT levenshtein_less_equal('extensive', 'exhaustive', 4)
84+
----
85+
4
86+
87+
# Test negative max distance (should behave like regular levenshtein)
88+
query I
89+
SELECT levenshtein_less_equal('extensive', 'exhaustive', -1)
90+
----
91+
4
3292

3393
query T
3494
SELECT soundex('hello world!')

pkg/sql/pgwire/testdata/pgtest/procedure

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -67,11 +67,11 @@ until
6767
ReadyForQuery
6868
----
6969
{"Type":"RowDescription","Fields":null}
70-
{"Severity":"NOTICE","SeverityUnlocalized":"NOTICE","Code":"00000","Message":"foo","Detail":"","Hint":"","Position":0,"InternalPosition":0,"InternalQuery":"","Where":"","SchemaName":"","TableName":"","ColumnName":"","DataTypeName":"","ConstraintName":"","File":"builtins.go","Line":0,"Routine":"func379","UnknownFields":null}
70+
{"Severity":"NOTICE","SeverityUnlocalized":"NOTICE","Code":"00000","Message":"foo","Detail":"","Hint":"","Position":0,"InternalPosition":0,"InternalQuery":"","Where":"","SchemaName":"","TableName":"","ColumnName":"","DataTypeName":"","ConstraintName":"","File":"builtins.go","Line":0,"Routine":"func381","UnknownFields":null}
7171
{"Type":"RowDescription","Fields":null}
72-
{"Severity":"NOTICE","SeverityUnlocalized":"NOTICE","Code":"00000","Message":"bar","Detail":"","Hint":"","Position":0,"InternalPosition":0,"InternalQuery":"","Where":"","SchemaName":"","TableName":"","ColumnName":"","DataTypeName":"","ConstraintName":"","File":"builtins.go","Line":0,"Routine":"func379","UnknownFields":null}
72+
{"Severity":"NOTICE","SeverityUnlocalized":"NOTICE","Code":"00000","Message":"bar","Detail":"","Hint":"","Position":0,"InternalPosition":0,"InternalQuery":"","Where":"","SchemaName":"","TableName":"","ColumnName":"","DataTypeName":"","ConstraintName":"","File":"builtins.go","Line":0,"Routine":"func381","UnknownFields":null}
7373
{"Type":"RowDescription","Fields":null}
74-
{"Severity":"NOTICE","SeverityUnlocalized":"NOTICE","Code":"00000","Message":"baz","Detail":"","Hint":"","Position":0,"InternalPosition":0,"InternalQuery":"","Where":"","SchemaName":"","TableName":"","ColumnName":"","DataTypeName":"","ConstraintName":"","File":"builtins.go","Line":0,"Routine":"func379","UnknownFields":null}
74+
{"Severity":"NOTICE","SeverityUnlocalized":"NOTICE","Code":"00000","Message":"baz","Detail":"","Hint":"","Position":0,"InternalPosition":0,"InternalQuery":"","Where":"","SchemaName":"","TableName":"","ColumnName":"","DataTypeName":"","ConstraintName":"","File":"builtins.go","Line":0,"Routine":"func381","UnknownFields":null}
7575
{"Type":"CommandComplete","CommandTag":"CALL"}
7676
{"Type":"ReadyForQuery","TxStatus":"I"}
7777

@@ -87,10 +87,10 @@ ReadyForQuery
8787
----
8888
{"Type":"ParseComplete"}
8989
{"Type":"BindComplete"}
90-
{"Severity":"NOTICE","SeverityUnlocalized":"NOTICE","Code":"00000","Message":"foo","Detail":"","Hint":"","Position":0,"InternalPosition":0,"InternalQuery":"","Where":"","SchemaName":"","TableName":"","ColumnName":"","DataTypeName":"","ConstraintName":"","File":"builtins.go","Line":0,"Routine":"func379","UnknownFields":null}
90+
{"Severity":"NOTICE","SeverityUnlocalized":"NOTICE","Code":"00000","Message":"foo","Detail":"","Hint":"","Position":0,"InternalPosition":0,"InternalQuery":"","Where":"","SchemaName":"","TableName":"","ColumnName":"","DataTypeName":"","ConstraintName":"","File":"builtins.go","Line":0,"Routine":"func381","UnknownFields":null}
9191
{"Type":"RowDescription","Fields":null}
92-
{"Severity":"NOTICE","SeverityUnlocalized":"NOTICE","Code":"00000","Message":"bar","Detail":"","Hint":"","Position":0,"InternalPosition":0,"InternalQuery":"","Where":"","SchemaName":"","TableName":"","ColumnName":"","DataTypeName":"","ConstraintName":"","File":"builtins.go","Line":0,"Routine":"func379","UnknownFields":null}
92+
{"Severity":"NOTICE","SeverityUnlocalized":"NOTICE","Code":"00000","Message":"bar","Detail":"","Hint":"","Position":0,"InternalPosition":0,"InternalQuery":"","Where":"","SchemaName":"","TableName":"","ColumnName":"","DataTypeName":"","ConstraintName":"","File":"builtins.go","Line":0,"Routine":"func381","UnknownFields":null}
9393
{"Type":"RowDescription","Fields":null}
94-
{"Severity":"NOTICE","SeverityUnlocalized":"NOTICE","Code":"00000","Message":"baz","Detail":"","Hint":"","Position":0,"InternalPosition":0,"InternalQuery":"","Where":"","SchemaName":"","TableName":"","ColumnName":"","DataTypeName":"","ConstraintName":"","File":"builtins.go","Line":0,"Routine":"func379","UnknownFields":null}
94+
{"Severity":"NOTICE","SeverityUnlocalized":"NOTICE","Code":"00000","Message":"baz","Detail":"","Hint":"","Position":0,"InternalPosition":0,"InternalQuery":"","Where":"","SchemaName":"","TableName":"","ColumnName":"","DataTypeName":"","ConstraintName":"","File":"builtins.go","Line":0,"Routine":"func381","UnknownFields":null}
9595
{"Type":"CommandComplete","CommandTag":"CALL"}
9696
{"Type":"ReadyForQuery","TxStatus":"I"}

pkg/sql/sem/builtins/builtins.go

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4104,7 +4104,6 @@ value if you rely on the HLC for accuracy.`,
41044104
Volatility: volatility.Immutable,
41054105
},
41064106
),
4107-
"levenshtein_less_equal": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 56820, Category: builtinconstants.CategoryFuzzyStringMatching}),
41084107
"metaphone": makeBuiltin(
41094108
tree.FunctionProperties{Category: builtinconstants.CategoryFuzzyStringMatching},
41104109
tree.Overload{
@@ -4135,6 +4134,62 @@ value if you rely on the HLC for accuracy.`,
41354134
),
41364135
"dmetaphone": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 56820, Category: builtinconstants.CategoryFuzzyStringMatching}),
41374136
"dmetaphone_alt": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 56820, Category: builtinconstants.CategoryFuzzyStringMatching}),
4137+
"levenshtein_less_equal": makeBuiltin(
4138+
tree.FunctionProperties{Category: builtinconstants.CategoryFuzzyStringMatching},
4139+
tree.Overload{
4140+
Types: tree.ParamTypes{
4141+
{Name: "source", Typ: types.String},
4142+
{Name: "target", Typ: types.String},
4143+
{Name: "max_d", Typ: types.Int},
4144+
},
4145+
ReturnType: tree.FixedReturnType(types.Int),
4146+
Fn: func(_ context.Context, _ *eval.Context, args tree.Datums) (tree.Datum, error) {
4147+
s, t := string(tree.MustBeDString(args[0])), string(tree.MustBeDString(args[1]))
4148+
d := int(tree.MustBeDInt(args[2]))
4149+
4150+
// Same limit as Postgres. Ref: https://github.com/postgres/postgres/blob/53ea2b7ad050ce4ad95c89bb55197209b65886a1/src/backend/utils/adt/levenshtein.c#L26
4151+
const maxLen = 255
4152+
if utf8.RuneCountInString(s) > maxLen || utf8.RuneCountInString(t) > maxLen {
4153+
return nil, pgerror.Newf(pgcode.InvalidParameterValue,
4154+
"levenshtein_less_equal argument exceeds maximum length of %d characters", maxLen)
4155+
}
4156+
ld := fuzzystrmatch.LevenshteinLessEqualDistance(s, t, d)
4157+
return tree.NewDInt(tree.DInt(ld)), nil
4158+
},
4159+
Info: "Calculates the Levenshtein distance between two strings. If actual distance is less or equal then max_d, " +
4160+
"then it returns the distance. Otherwise this function returns a value greater than max_d. " +
4161+
"The maximum length of the input strings is 255 characters.",
4162+
Volatility: volatility.Immutable,
4163+
},
4164+
tree.Overload{
4165+
Types: tree.ParamTypes{
4166+
{Name: "source", Typ: types.String},
4167+
{Name: "target", Typ: types.String},
4168+
{Name: "ins_cost", Typ: types.Int},
4169+
{Name: "del_cost", Typ: types.Int},
4170+
{Name: "sub_cost", Typ: types.Int},
4171+
{Name: "max_d", Typ: types.Int},
4172+
},
4173+
ReturnType: tree.FixedReturnType(types.Int),
4174+
Fn: func(_ context.Context, _ *eval.Context, args tree.Datums) (tree.Datum, error) {
4175+
s, t := string(tree.MustBeDString(args[0])), string(tree.MustBeDString(args[1]))
4176+
ins, del, sub := int(tree.MustBeDInt(args[2])), int(tree.MustBeDInt(args[3])), int(tree.MustBeDInt(args[4]))
4177+
d := int(tree.MustBeDInt(args[5]))
4178+
const maxLen = 255
4179+
if utf8.RuneCountInString(s) > maxLen || utf8.RuneCountInString(t) > maxLen {
4180+
return nil, pgerror.Newf(pgcode.InvalidParameterValue,
4181+
"levenshtein_less_equal argument exceeds maximum length of %d characters", maxLen)
4182+
}
4183+
ld := fuzzystrmatch.LevenshteinLessEqualDistanceWithCost(s, t, ins, del, sub, d)
4184+
return tree.NewDInt(tree.DInt(ld)), nil
4185+
},
4186+
Info: "Calculates the Levenshtein distance between two strings. The cost parameters specify how much to " +
4187+
"charge for each edit operation. If actual distance is less or equal then max_d, " +
4188+
"then it returns the distance. Otherwise this function returns a value greater than max_d. " +
4189+
"The maximum length of the input strings is 255 characters.",
4190+
Volatility: volatility.Immutable,
4191+
},
4192+
),
41384193

41394194
// JSON functions.
41404195
// The behavior of both the JSON and JSONB data types in CockroachDB is

pkg/sql/sem/builtins/fixed_oids.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2857,6 +2857,8 @@ var builtinOidsArray = []string{
28572857
2902: `ltree2text(ltree: ltree) -> string`,
28582858
2903: `lca(ltree, ltree, ltree...) -> ltree`,
28592859
2904: `lca(ltree[]: ltree[]) -> ltree`,
2860+
2905: `levenshtein_less_equal(source: string, target: string, max_d: int) -> int`,
2861+
2906: `levenshtein_less_equal(source: string, target: string, ins_cost: int, del_cost: int, sub_cost: int, max_d: int) -> int`,
28602862
}
28612863

28622864
var builtinOidsBySignature map[string]oid.Oid

0 commit comments

Comments
 (0)