diff --git a/enginetest/queries/logic_test_scripts.go b/enginetest/queries/logic_test_scripts.go index 08013a0484..f5bab9e92e 100644 --- a/enginetest/queries/logic_test_scripts.go +++ b/enginetest/queries/logic_test_scripts.go @@ -1004,39 +1004,41 @@ var SQLLogicSubqueryTests = []ScriptTest{ }, }, }, - //{ - // Name: "multiple nested subquery", - // SetUpScript: []string{ - // "CREATE TABLE `groups`(id SERIAL PRIMARY KEY, data JSON);", - // "INSERT INTO `groups`(data) VALUES('{\"name\": \"Group 1\", \"members\": [{\"name\": \"admin\", \"type\": \"USER\"}, {\"name\": \"user\", \"type\": \"USER\"}]}');", - // "INSERT INTO `groups`(data) VALUES('{\"name\": \"Group 2\", \"members\": [{\"name\": \"admin2\", \"type\": \"USER\"}]}');", - // "CREATE TABLE t32786 (id VARCHAR(36) PRIMARY KEY, parent_id VARCHAR(36), parent_path text);", - // "INSERT INTO t32786 VALUES ('3AAA2577-DBC3-47E7-9E85-9CC7E19CF48A', null, null);", - // "INSERT INTO t32786 VALUES ('5AE7EAFD-8277-4F41-83DE-0FD4B4482169', '3AAA2577-DBC3-47E7-9E85-9CC7E19CF48A', null);", - // "CREATE TABLE users (id INT8 NOT NULL, name VARCHAR(50), PRIMARY KEY (id));", - // "INSERT INTO users(id, name) VALUES (1, 'user1');", - // "INSERT INTO users(id, name) VALUES (2, 'user2');", - // "INSERT INTO users(id, name) VALUES (3, 'user3');", - // "CREATE TABLE stuff(id INT8 NOT NULL, date DATE, user_id INT8, PRIMARY KEY (id), FOREIGN KEY (user_id) REFERENCES users (id));", - // "INSERT INTO stuff(id, date, user_id) VALUES (1, '2007-10-15', 1);", - // "INSERT INTO stuff(id, date, user_id) VALUES (2, '2007-12-15', 1);", - // "INSERT INTO stuff(id, date, user_id) VALUES (3, '2007-11-15', 1);", - // "INSERT INTO stuff(id, date, user_id) VALUES (4, '2008-01-15', 2);", - // "INSERT INTO stuff(id, date, user_id) VALUES (5, '2007-06-15', 3);", - // "INSERT INTO stuff(id, date, user_id) VALUES (6, '2007-03-15', 3);", - // }, - // Assertions: []ScriptTestAssertion{ - // { - // Skip: true, - // Query: "SELECT users.id AS users_id, users.name AS users_name, stuff_1.id AS stuff_1_id, stuff_1.date AS stuff_1_date, stuff_1.user_id AS stuff_1_user_id FROM users LEFT JOIN stuff AS stuff_1 ON users.id = stuff_1.user_id AND stuff_1.id = (SELECT stuff_2.id FROM stuff AS stuff_2 WHERE stuff_2.user_id = users.id ORDER BY stuff_2.date DESC LIMIT 1) ORDER BY users.name;", - // Expected: []sql.Row{ - // {1, "user1", 2, 2007-12-15, 1}, - // {2, "user2", 4, 2008-01-15, 2}, - // {3, "user3", 5, 2007-06-15, 3}, - // }, - // }, - // }, - //}, + { + // Skipping because we don't convert Time objects to strings in enginetests + Skip: true, + Name: "multiple nested subquery", + SetUpScript: []string{ + "CREATE TABLE `groups`(id SERIAL PRIMARY KEY, data JSON);", + "INSERT INTO `groups`(data) VALUES('{\"name\": \"Group 1\", \"members\": [{\"name\": \"admin\", \"type\": \"USER\"}, {\"name\": \"user\", \"type\": \"USER\"}]}');", + "INSERT INTO `groups`(data) VALUES('{\"name\": \"Group 2\", \"members\": [{\"name\": \"admin2\", \"type\": \"USER\"}]}');", + "CREATE TABLE t32786 (id VARCHAR(36) PRIMARY KEY, parent_id VARCHAR(36), parent_path text);", + "INSERT INTO t32786 VALUES ('3AAA2577-DBC3-47E7-9E85-9CC7E19CF48A', null, null);", + "INSERT INTO t32786 VALUES ('5AE7EAFD-8277-4F41-83DE-0FD4B4482169', '3AAA2577-DBC3-47E7-9E85-9CC7E19CF48A', null);", + "CREATE TABLE users (id INT8 NOT NULL, name VARCHAR(50), PRIMARY KEY (id));", + "INSERT INTO users(id, name) VALUES (1, 'user1');", + "INSERT INTO users(id, name) VALUES (2, 'user2');", + "INSERT INTO users(id, name) VALUES (3, 'user3');", + "CREATE TABLE stuff(id INT8 NOT NULL, date DATE, user_id INT8, PRIMARY KEY (id), FOREIGN KEY (user_id) REFERENCES users (id));", + "INSERT INTO stuff(id, date, user_id) VALUES (1, '2007-10-15', 1);", + "INSERT INTO stuff(id, date, user_id) VALUES (2, '2007-12-15', 1);", + "INSERT INTO stuff(id, date, user_id) VALUES (3, '2007-11-15', 1);", + "INSERT INTO stuff(id, date, user_id) VALUES (4, '2008-01-15', 2);", + "INSERT INTO stuff(id, date, user_id) VALUES (5, '2007-06-15', 3);", + "INSERT INTO stuff(id, date, user_id) VALUES (6, '2007-03-15', 3);", + }, + Assertions: []ScriptTestAssertion{ + { + Skip: true, + Query: "SELECT users.id AS users_id, users.name AS users_name, stuff_1.id AS stuff_1_id, stuff_1.date AS stuff_1_date, stuff_1.user_id AS stuff_1_user_id FROM users LEFT JOIN stuff AS stuff_1 ON users.id = stuff_1.user_id AND stuff_1.id = (SELECT stuff_2.id FROM stuff AS stuff_2 WHERE stuff_2.user_id = users.id ORDER BY stuff_2.date DESC LIMIT 1) ORDER BY users.name;", + Expected: []sql.Row{ + {1, "user1", 2, "2007 - 12 - 15", 1}, + {2, "user2", 4, "2008 - 01 - 15", 2}, + {3, "user3", 5, "2007 - 06 - 15", 3}, + }, + }, + }, + }, { Name: "multiple nested subquery again", SetUpScript: []string{ diff --git a/enginetest/queries/queries.go b/enginetest/queries/queries.go index 991ee3577e..98e821a7fb 100644 --- a/enginetest/queries/queries.go +++ b/enginetest/queries/queries.go @@ -828,10 +828,6 @@ var QueryTests = []QueryTest{ Query: "select y as x from xy group by (y) having AVG(x) > 0", Expected: []sql.Row{{0}, {1}, {3}}, }, - // { - // Query: "select y as z from xy group by (y) having AVG(z) > 0", - // Expected: []sql.Row{{1}, {2}, {3}}, - // }, { Query: "SELECT * FROM mytable t0 INNER JOIN mytable t1 ON (t1.i IN (((true)%(''))));", Expected: []sql.Row{}, @@ -10381,6 +10377,28 @@ from typestable`, {2, "second row"}, }, }, + { + Query: "select * from two_pk group by pk1, pk2", + Expected: []sql.Row{ + {0, 0, 0, 1, 2, 3, 4}, + {0, 1, 10, 11, 12, 13, 14}, + {1, 0, 20, 21, 22, 23, 24}, + {1, 1, 30, 31, 32, 33, 34}, + }, + }, + { + Query: "select pk1+1 from two_pk group by pk1 + 1, mod(pk2, 2)", + Expected: []sql.Row{ + {1}, {1}, {2}, {2}, + }, + }, + { + Query: "select mod(pk2, 2) from two_pk group by pk1 + 1, mod(pk2, 2)", + Expected: []sql.Row{ + // mod is a Decimal type, which we convert to a string in our enginetests + {"0"}, {"1"}, {"0"}, {"1"}, + }, + }, } var KeylessQueries = []QueryTest{ @@ -10632,6 +10650,20 @@ FROM mytable;`, {"DECIMAL"}, }, }, + // https://github.com/dolthub/dolt/issues/7095 + // References in group by and having should be allowed to match select aliases + { + Query: "select y as z from xy group by (y) having AVG(z) > 0", + Expected: []sql.Row{{1}, {2}, {3}}, + }, + { + Query: "select y as z from xy group by (z) having AVG(z) > 0", + Expected: []sql.Row{{1}, {2}, {3}}, + }, + { + Query: "select y + 1 as z from xy group by (z) having AVG(z) > 1", + Expected: []sql.Row{{2}, {3}, {4}}, + }, } var VersionedQueries = []QueryTest{ @@ -11399,6 +11431,15 @@ var ErrorQueries = []QueryErrorTest{ Query: "SELECT 1 INTO mytable;", ExpectedErr: sql.ErrUndeclaredVariable, }, + { + Query: "select * from two_pk group by pk1", + ExpectedErr: analyzererrors.ErrValidationGroupBy, + }, + { + // Grouping over functions and math expressions over PK does not count, and must appear in select + Query: "select * from two_pk group by pk1 + 1, mod(pk2, 2)", + ExpectedErr: analyzererrors.ErrValidationGroupBy, + }, } var BrokenErrorQueries = []QueryErrorTest{ @@ -11415,10 +11456,10 @@ var BrokenErrorQueries = []QueryErrorTest{ ExpectedErr: sql.ErrTableNotFound, }, - // Our behavior in when sql_mode = ONLY_FULL_GROUP_BY is inconsistent with MySQL + // Our behavior in when sql_mode = ONLY_FULL_GROUP_BY is inconsistent with MySQL. This is because we skip validation + // for GroupBys wrapped in a Project since we are not able to validate selected expressions that get optimized as an + // alias. // Relevant issue: https://github.com/dolthub/dolt/issues/4998 - // Special case: If you are grouping by every field of the PK, then you can select anything - // Otherwise, whatever you are selecting must be in the Group By (with the exception of aggregations) { Query: "SELECT col0, floor(col1) FROM tab1 GROUP by col0;", ExpectedErr: analyzererrors.ErrValidationGroupBy, @@ -11427,39 +11468,11 @@ var BrokenErrorQueries = []QueryErrorTest{ Query: "SELECT floor(cor0.col1) * ceil(cor0.col0) AS col2 FROM tab1 AS cor0 GROUP BY cor0.col0", ExpectedErr: analyzererrors.ErrValidationGroupBy, }, - { - Query: "select * from two_pk group by pk1, pk2", - // No error - }, - { - Query: "select * from two_pk group by pk1", - ExpectedErr: analyzererrors.ErrValidationGroupBy, - }, - { - // Grouping over functions and math expressions over PK does not count, and must appear in select - Query: "select * from two_pk group by pk1 + 1, mod(pk2, 2)", - ExpectedErr: analyzererrors.ErrValidationGroupBy, - }, - { - // Grouping over functions and math expressions over PK does not count, and must appear in select - Query: "select pk1+1 from two_pk group by pk1 + 1, mod(pk2, 2)", - // No error - }, - { - // Grouping over functions and math expressions over PK does not count, and must appear in select - Query: "select mod(pk2, 2) from two_pk group by pk1 + 1, mod(pk2, 2)", - // No error - }, - { - // Grouping over functions and math expressions over PK does not count, and must appear in select - Query: "select mod(pk2, 2) from two_pk group by pk1 + 1, mod(pk2, 2)", - // No error - }, { Query: `SELECT any_value(pk), (SELECT max(pk) FROM one_pk WHERE pk < opk.pk) AS x FROM one_pk opk WHERE (SELECT max(pk) FROM one_pk WHERE pk < opk.pk) > 0 GROUP BY (SELECT max(pk) FROM one_pk WHERE pk < opk.pk) ORDER BY x`, - // No error, but we get opk.pk does not exist + // No error, but we get opk.pk does not exist (aliasing error) }, // Unimplemented JSON functions { diff --git a/sql/analyzer/validation_rules.go b/sql/analyzer/validation_rules.go index 85db26bad8..b8e0cc50b5 100644 --- a/sql/analyzer/validation_rules.go +++ b/sql/analyzer/validation_rules.go @@ -262,8 +262,9 @@ func validateGroupBy(ctx *sql.Context, a *Analyzer, n sql.Node, scope *plan.Scop switch parent.(type) { case *plan.Having, *plan.Project, *plan.Sort: - // TODO: these shouldn't be skipped; you can group by primary key without problem b/c only one value - // https://dev.mysql.com/doc/refman/8.0/en/group-by-handling.html#:~:text=The%20query%20is%20valid%20if%20name%20is%20a%20primary%20key + // TODO: these shouldn't be skipped but we currently aren't able to validate GroupBys with selected aliased + // expressions and a lot of our tests group by aliases + // https://github.com/dolthub/dolt/issues/4998 return true } @@ -273,17 +274,34 @@ func validateGroupBy(ctx *sql.Context, a *Analyzer, n sql.Node, scope *plan.Scop return true } - var groupBys []string + primaryKeys := make(map[string]bool) + for _, col := range gb.Child.Schema() { + if col.PrimaryKey { + primaryKeys[strings.ToLower(col.String())] = true + } + } + + groupBys := make(map[string]bool) + groupByPrimaryKeys := 0 for _, expr := range gb.GroupByExprs { - groupBys = append(groupBys, expr.String()) + exprStr := strings.ToLower(expr.String()) + groupBys[exprStr] = true + if primaryKeys[exprStr] { + groupByPrimaryKeys++ + } + } + + // TODO: also allow grouping by unique non-nullable columns + if len(primaryKeys) != 0 && groupByPrimaryKeys == len(primaryKeys) { + return true } for _, expr := range gb.SelectedExprs { - if _, ok := expr.(sql.Aggregation); !ok { - if !expressionReferencesOnlyGroupBys(groupBys, expr) { - err = analyzererrors.ErrValidationGroupBy.New(expr.String()) - return false - } + if !expressionReferencesOnlyGroupBys(groupBys, expr) { + // TODO: this is currently too restrictive. Dependent columns are fine to reference + // https://dev.mysql.com/doc/refman/8.4/en/group-by-functional-dependence.html + err = analyzererrors.ErrValidationGroupBy.New(expr.String()) + return false } } return true @@ -292,22 +310,14 @@ func validateGroupBy(ctx *sql.Context, a *Analyzer, n sql.Node, scope *plan.Scop return n, transform.SameTree, err } -func expressionReferencesOnlyGroupBys(groupBys []string, expr sql.Expression) bool { +func expressionReferencesOnlyGroupBys(groupBys map[string]bool, expr sql.Expression) bool { valid := true sql.Inspect(expr, func(expr sql.Expression) bool { switch expr := expr.(type) { case nil, sql.Aggregation, *expression.Literal: return false - case *expression.Alias, sql.FunctionExpression: - if stringContains(groupBys, expr.String()) { - return false - } - return true - // cc: https://dev.mysql.com/doc/refman/8.0/en/group-by-handling.html - // Each part of the SelectExpr must refer to the aggregated columns in some way - // TODO: this isn't complete, it's overly restrictive. Dependant columns are fine to reference. default: - if stringContains(groupBys, expr.String()) { + if groupBys[strings.ToLower(expr.String())] { return false } diff --git a/sql/column.go b/sql/column.go index 87121bf23c..5fe8af3fa1 100644 --- a/sql/column.go +++ b/sql/column.go @@ -130,6 +130,10 @@ func (c Column) Copy() *Column { return &c } +func (c *Column) String() string { + return c.Source + "." + c.Name +} + // TableId is the unique identifier of a table or table alias in a multi-db environment. // The long-term goal is to migrate all uses of table name strings to this and minimize places where we // construct/inspect TableIDs. By treating this as an opaque identifier, it will be easier to migrate to