Skip to content

Commit 62fac20

Browse files
authored
Merge pull request #3082 from dolthub/angela/groupby
Allow GroupBy primary key (and some test clean up)
2 parents 9d5902e + 87ed65e commit 62fac20

File tree

4 files changed

+117
-88
lines changed

4 files changed

+117
-88
lines changed

enginetest/queries/logic_test_scripts.go

Lines changed: 35 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1004,39 +1004,41 @@ var SQLLogicSubqueryTests = []ScriptTest{
10041004
},
10051005
},
10061006
},
1007-
//{
1008-
// Name: "multiple nested subquery",
1009-
// SetUpScript: []string{
1010-
// "CREATE TABLE `groups`(id SERIAL PRIMARY KEY, data JSON);",
1011-
// "INSERT INTO `groups`(data) VALUES('{\"name\": \"Group 1\", \"members\": [{\"name\": \"admin\", \"type\": \"USER\"}, {\"name\": \"user\", \"type\": \"USER\"}]}');",
1012-
// "INSERT INTO `groups`(data) VALUES('{\"name\": \"Group 2\", \"members\": [{\"name\": \"admin2\", \"type\": \"USER\"}]}');",
1013-
// "CREATE TABLE t32786 (id VARCHAR(36) PRIMARY KEY, parent_id VARCHAR(36), parent_path text);",
1014-
// "INSERT INTO t32786 VALUES ('3AAA2577-DBC3-47E7-9E85-9CC7E19CF48A', null, null);",
1015-
// "INSERT INTO t32786 VALUES ('5AE7EAFD-8277-4F41-83DE-0FD4B4482169', '3AAA2577-DBC3-47E7-9E85-9CC7E19CF48A', null);",
1016-
// "CREATE TABLE users (id INT8 NOT NULL, name VARCHAR(50), PRIMARY KEY (id));",
1017-
// "INSERT INTO users(id, name) VALUES (1, 'user1');",
1018-
// "INSERT INTO users(id, name) VALUES (2, 'user2');",
1019-
// "INSERT INTO users(id, name) VALUES (3, 'user3');",
1020-
// "CREATE TABLE stuff(id INT8 NOT NULL, date DATE, user_id INT8, PRIMARY KEY (id), FOREIGN KEY (user_id) REFERENCES users (id));",
1021-
// "INSERT INTO stuff(id, date, user_id) VALUES (1, '2007-10-15', 1);",
1022-
// "INSERT INTO stuff(id, date, user_id) VALUES (2, '2007-12-15', 1);",
1023-
// "INSERT INTO stuff(id, date, user_id) VALUES (3, '2007-11-15', 1);",
1024-
// "INSERT INTO stuff(id, date, user_id) VALUES (4, '2008-01-15', 2);",
1025-
// "INSERT INTO stuff(id, date, user_id) VALUES (5, '2007-06-15', 3);",
1026-
// "INSERT INTO stuff(id, date, user_id) VALUES (6, '2007-03-15', 3);",
1027-
// },
1028-
// Assertions: []ScriptTestAssertion{
1029-
// {
1030-
// Skip: true,
1031-
// Query: "SELECT users.id AS users_id, users.name AS users_name, stuff_1.id AS stuff_1_id, stuff_1.date AS stuff_1_date, stuff_1.user_id AS stuff_1_user_id FROM users LEFT JOIN stuff AS stuff_1 ON users.id = stuff_1.user_id AND stuff_1.id = (SELECT stuff_2.id FROM stuff AS stuff_2 WHERE stuff_2.user_id = users.id ORDER BY stuff_2.date DESC LIMIT 1) ORDER BY users.name;",
1032-
// Expected: []sql.Row{
1033-
// {1, "user1", 2, 2007-12-15, 1},
1034-
// {2, "user2", 4, 2008-01-15, 2},
1035-
// {3, "user3", 5, 2007-06-15, 3},
1036-
// },
1037-
// },
1038-
// },
1039-
//},
1007+
{
1008+
// Skipping because we don't convert Time objects to strings in enginetests
1009+
Skip: true,
1010+
Name: "multiple nested subquery",
1011+
SetUpScript: []string{
1012+
"CREATE TABLE `groups`(id SERIAL PRIMARY KEY, data JSON);",
1013+
"INSERT INTO `groups`(data) VALUES('{\"name\": \"Group 1\", \"members\": [{\"name\": \"admin\", \"type\": \"USER\"}, {\"name\": \"user\", \"type\": \"USER\"}]}');",
1014+
"INSERT INTO `groups`(data) VALUES('{\"name\": \"Group 2\", \"members\": [{\"name\": \"admin2\", \"type\": \"USER\"}]}');",
1015+
"CREATE TABLE t32786 (id VARCHAR(36) PRIMARY KEY, parent_id VARCHAR(36), parent_path text);",
1016+
"INSERT INTO t32786 VALUES ('3AAA2577-DBC3-47E7-9E85-9CC7E19CF48A', null, null);",
1017+
"INSERT INTO t32786 VALUES ('5AE7EAFD-8277-4F41-83DE-0FD4B4482169', '3AAA2577-DBC3-47E7-9E85-9CC7E19CF48A', null);",
1018+
"CREATE TABLE users (id INT8 NOT NULL, name VARCHAR(50), PRIMARY KEY (id));",
1019+
"INSERT INTO users(id, name) VALUES (1, 'user1');",
1020+
"INSERT INTO users(id, name) VALUES (2, 'user2');",
1021+
"INSERT INTO users(id, name) VALUES (3, 'user3');",
1022+
"CREATE TABLE stuff(id INT8 NOT NULL, date DATE, user_id INT8, PRIMARY KEY (id), FOREIGN KEY (user_id) REFERENCES users (id));",
1023+
"INSERT INTO stuff(id, date, user_id) VALUES (1, '2007-10-15', 1);",
1024+
"INSERT INTO stuff(id, date, user_id) VALUES (2, '2007-12-15', 1);",
1025+
"INSERT INTO stuff(id, date, user_id) VALUES (3, '2007-11-15', 1);",
1026+
"INSERT INTO stuff(id, date, user_id) VALUES (4, '2008-01-15', 2);",
1027+
"INSERT INTO stuff(id, date, user_id) VALUES (5, '2007-06-15', 3);",
1028+
"INSERT INTO stuff(id, date, user_id) VALUES (6, '2007-03-15', 3);",
1029+
},
1030+
Assertions: []ScriptTestAssertion{
1031+
{
1032+
Skip: true,
1033+
Query: "SELECT users.id AS users_id, users.name AS users_name, stuff_1.id AS stuff_1_id, stuff_1.date AS stuff_1_date, stuff_1.user_id AS stuff_1_user_id FROM users LEFT JOIN stuff AS stuff_1 ON users.id = stuff_1.user_id AND stuff_1.id = (SELECT stuff_2.id FROM stuff AS stuff_2 WHERE stuff_2.user_id = users.id ORDER BY stuff_2.date DESC LIMIT 1) ORDER BY users.name;",
1034+
Expected: []sql.Row{
1035+
{1, "user1", 2, "2007 - 12 - 15", 1},
1036+
{2, "user2", 4, "2008 - 01 - 15", 2},
1037+
{3, "user3", 5, "2007 - 06 - 15", 3},
1038+
},
1039+
},
1040+
},
1041+
},
10401042
{
10411043
Name: "multiple nested subquery again",
10421044
SetUpScript: []string{

enginetest/queries/queries.go

Lines changed: 49 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -828,10 +828,6 @@ var QueryTests = []QueryTest{
828828
Query: "select y as x from xy group by (y) having AVG(x) > 0",
829829
Expected: []sql.Row{{0}, {1}, {3}},
830830
},
831-
// {
832-
// Query: "select y as z from xy group by (y) having AVG(z) > 0",
833-
// Expected: []sql.Row{{1}, {2}, {3}},
834-
// },
835831
{
836832
Query: "SELECT * FROM mytable t0 INNER JOIN mytable t1 ON (t1.i IN (((true)%(''))));",
837833
Expected: []sql.Row{},
@@ -10798,6 +10794,28 @@ from typestable`,
1079810794
{2, "second row"},
1079910795
},
1080010796
},
10797+
{
10798+
Query: "select * from two_pk group by pk1, pk2",
10799+
Expected: []sql.Row{
10800+
{0, 0, 0, 1, 2, 3, 4},
10801+
{0, 1, 10, 11, 12, 13, 14},
10802+
{1, 0, 20, 21, 22, 23, 24},
10803+
{1, 1, 30, 31, 32, 33, 34},
10804+
},
10805+
},
10806+
{
10807+
Query: "select pk1+1 from two_pk group by pk1 + 1, mod(pk2, 2)",
10808+
Expected: []sql.Row{
10809+
{1}, {1}, {2}, {2},
10810+
},
10811+
},
10812+
{
10813+
Query: "select mod(pk2, 2) from two_pk group by pk1 + 1, mod(pk2, 2)",
10814+
Expected: []sql.Row{
10815+
// mod is a Decimal type, which we convert to a string in our enginetests
10816+
{"0"}, {"1"}, {"0"}, {"1"},
10817+
},
10818+
},
1080110819
}
1080210820

1080310821
var KeylessQueries = []QueryTest{
@@ -11049,6 +11067,20 @@ FROM mytable;`,
1104911067
{"DECIMAL"},
1105011068
},
1105111069
},
11070+
// https://github.com/dolthub/dolt/issues/7095
11071+
// References in group by and having should be allowed to match select aliases
11072+
{
11073+
Query: "select y as z from xy group by (y) having AVG(z) > 0",
11074+
Expected: []sql.Row{{1}, {2}, {3}},
11075+
},
11076+
{
11077+
Query: "select y as z from xy group by (z) having AVG(z) > 0",
11078+
Expected: []sql.Row{{1}, {2}, {3}},
11079+
},
11080+
{
11081+
Query: "select y + 1 as z from xy group by (z) having AVG(z) > 1",
11082+
Expected: []sql.Row{{2}, {3}, {4}},
11083+
},
1105211084
}
1105311085

1105411086
var VersionedQueries = []QueryTest{
@@ -11816,6 +11848,15 @@ var ErrorQueries = []QueryErrorTest{
1181611848
Query: "SELECT 1 INTO mytable;",
1181711849
ExpectedErr: sql.ErrUndeclaredVariable,
1181811850
},
11851+
{
11852+
Query: "select * from two_pk group by pk1",
11853+
ExpectedErr: analyzererrors.ErrValidationGroupBy,
11854+
},
11855+
{
11856+
// Grouping over functions and math expressions over PK does not count, and must appear in select
11857+
Query: "select * from two_pk group by pk1 + 1, mod(pk2, 2)",
11858+
ExpectedErr: analyzererrors.ErrValidationGroupBy,
11859+
},
1181911860
}
1182011861

1182111862
var BrokenErrorQueries = []QueryErrorTest{
@@ -11832,10 +11873,10 @@ var BrokenErrorQueries = []QueryErrorTest{
1183211873
ExpectedErr: sql.ErrTableNotFound,
1183311874
},
1183411875

11835-
// Our behavior in when sql_mode = ONLY_FULL_GROUP_BY is inconsistent with MySQL
11876+
// Our behavior in when sql_mode = ONLY_FULL_GROUP_BY is inconsistent with MySQL. This is because we skip validation
11877+
// for GroupBys wrapped in a Project since we are not able to validate selected expressions that get optimized as an
11878+
// alias.
1183611879
// Relevant issue: https://github.com/dolthub/dolt/issues/4998
11837-
// Special case: If you are grouping by every field of the PK, then you can select anything
11838-
// Otherwise, whatever you are selecting must be in the Group By (with the exception of aggregations)
1183911880
{
1184011881
Query: "SELECT col0, floor(col1) FROM tab1 GROUP by col0;",
1184111882
ExpectedErr: analyzererrors.ErrValidationGroupBy,
@@ -11844,39 +11885,11 @@ var BrokenErrorQueries = []QueryErrorTest{
1184411885
Query: "SELECT floor(cor0.col1) * ceil(cor0.col0) AS col2 FROM tab1 AS cor0 GROUP BY cor0.col0",
1184511886
ExpectedErr: analyzererrors.ErrValidationGroupBy,
1184611887
},
11847-
{
11848-
Query: "select * from two_pk group by pk1, pk2",
11849-
// No error
11850-
},
11851-
{
11852-
Query: "select * from two_pk group by pk1",
11853-
ExpectedErr: analyzererrors.ErrValidationGroupBy,
11854-
},
11855-
{
11856-
// Grouping over functions and math expressions over PK does not count, and must appear in select
11857-
Query: "select * from two_pk group by pk1 + 1, mod(pk2, 2)",
11858-
ExpectedErr: analyzererrors.ErrValidationGroupBy,
11859-
},
11860-
{
11861-
// Grouping over functions and math expressions over PK does not count, and must appear in select
11862-
Query: "select pk1+1 from two_pk group by pk1 + 1, mod(pk2, 2)",
11863-
// No error
11864-
},
11865-
{
11866-
// Grouping over functions and math expressions over PK does not count, and must appear in select
11867-
Query: "select mod(pk2, 2) from two_pk group by pk1 + 1, mod(pk2, 2)",
11868-
// No error
11869-
},
11870-
{
11871-
// Grouping over functions and math expressions over PK does not count, and must appear in select
11872-
Query: "select mod(pk2, 2) from two_pk group by pk1 + 1, mod(pk2, 2)",
11873-
// No error
11874-
},
1187511888
{
1187611889
Query: `SELECT any_value(pk), (SELECT max(pk) FROM one_pk WHERE pk < opk.pk) AS x
1187711890
FROM one_pk opk WHERE (SELECT max(pk) FROM one_pk WHERE pk < opk.pk) > 0
1187811891
GROUP BY (SELECT max(pk) FROM one_pk WHERE pk < opk.pk) ORDER BY x`,
11879-
// No error, but we get opk.pk does not exist
11892+
// No error, but we get opk.pk does not exist (aliasing error)
1188011893
},
1188111894
// Unimplemented JSON functions
1188211895
{

sql/analyzer/validation_rules.go

Lines changed: 29 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -262,8 +262,9 @@ func validateGroupBy(ctx *sql.Context, a *Analyzer, n sql.Node, scope *plan.Scop
262262

263263
switch parent.(type) {
264264
case *plan.Having, *plan.Project, *plan.Sort:
265-
// TODO: these shouldn't be skipped; you can group by primary key without problem b/c only one value
266-
// https://dev.mysql.com/doc/refman/8.0/en/group-by-handling.html#:~:text=The%20query%20is%20valid%20if%20name%20is%20a%20primary%20key
265+
// TODO: these shouldn't be skipped but we currently aren't able to validate GroupBys with selected aliased
266+
// expressions and a lot of our tests group by aliases
267+
// https://github.com/dolthub/dolt/issues/4998
267268
return true
268269
}
269270

@@ -273,17 +274,34 @@ func validateGroupBy(ctx *sql.Context, a *Analyzer, n sql.Node, scope *plan.Scop
273274
return true
274275
}
275276

276-
var groupBys []string
277+
primaryKeys := make(map[string]bool)
278+
for _, col := range gb.Child.Schema() {
279+
if col.PrimaryKey {
280+
primaryKeys[strings.ToLower(col.String())] = true
281+
}
282+
}
283+
284+
groupBys := make(map[string]bool)
285+
groupByPrimaryKeys := 0
277286
for _, expr := range gb.GroupByExprs {
278-
groupBys = append(groupBys, expr.String())
287+
exprStr := strings.ToLower(expr.String())
288+
groupBys[exprStr] = true
289+
if primaryKeys[exprStr] {
290+
groupByPrimaryKeys++
291+
}
292+
}
293+
294+
// TODO: also allow grouping by unique non-nullable columns
295+
if len(primaryKeys) != 0 && groupByPrimaryKeys == len(primaryKeys) {
296+
return true
279297
}
280298

281299
for _, expr := range gb.SelectedExprs {
282-
if _, ok := expr.(sql.Aggregation); !ok {
283-
if !expressionReferencesOnlyGroupBys(groupBys, expr) {
284-
err = analyzererrors.ErrValidationGroupBy.New(expr.String())
285-
return false
286-
}
300+
if !expressionReferencesOnlyGroupBys(groupBys, expr) {
301+
// TODO: this is currently too restrictive. Dependent columns are fine to reference
302+
// https://dev.mysql.com/doc/refman/8.4/en/group-by-functional-dependence.html
303+
err = analyzererrors.ErrValidationGroupBy.New(expr.String())
304+
return false
287305
}
288306
}
289307
return true
@@ -292,22 +310,14 @@ func validateGroupBy(ctx *sql.Context, a *Analyzer, n sql.Node, scope *plan.Scop
292310
return n, transform.SameTree, err
293311
}
294312

295-
func expressionReferencesOnlyGroupBys(groupBys []string, expr sql.Expression) bool {
313+
func expressionReferencesOnlyGroupBys(groupBys map[string]bool, expr sql.Expression) bool {
296314
valid := true
297315
sql.Inspect(expr, func(expr sql.Expression) bool {
298316
switch expr := expr.(type) {
299317
case nil, sql.Aggregation, *expression.Literal:
300318
return false
301-
case *expression.Alias, sql.FunctionExpression:
302-
if stringContains(groupBys, expr.String()) {
303-
return false
304-
}
305-
return true
306-
// cc: https://dev.mysql.com/doc/refman/8.0/en/group-by-handling.html
307-
// Each part of the SelectExpr must refer to the aggregated columns in some way
308-
// TODO: this isn't complete, it's overly restrictive. Dependant columns are fine to reference.
309319
default:
310-
if stringContains(groupBys, expr.String()) {
320+
if groupBys[strings.ToLower(expr.String())] {
311321
return false
312322
}
313323

sql/column.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,10 @@ func (c Column) Copy() *Column {
130130
return &c
131131
}
132132

133+
func (c *Column) String() string {
134+
return c.Source + "." + c.Name
135+
}
136+
133137
// TableId is the unique identifier of a table or table alias in a multi-db environment.
134138
// The long-term goal is to migrate all uses of table name strings to this and minimize places where we
135139
// construct/inspect TableIDs. By treating this as an opaque identifier, it will be easier to migrate to

0 commit comments

Comments
 (0)