dolthub · angelamayxie · Jul 10, 2025 · Jul 1, 2025 · Jul 9, 2025 · Jul 9, 2025
diff --git a/enginetest/queries/logic_test_scripts.go b/enginetest/queries/logic_test_scripts.go
@@ -1004,39 +1004,41 @@ var SQLLogicSubqueryTests = []ScriptTest{
 			},
 		},
 	},
-	//{
-	//	Name: "multiple nested subquery",
-	//	SetUpScript: []string{
-	//		"CREATE TABLE `groups`(id SERIAL PRIMARY KEY, data JSON);",
-	//		"INSERT INTO `groups`(data) VALUES('{\"name\": \"Group 1\", \"members\": [{\"name\": \"admin\", \"type\": \"USER\"}, {\"name\": \"user\", \"type\": \"USER\"}]}');",
-	//		"INSERT INTO `groups`(data) VALUES('{\"name\": \"Group 2\", \"members\": [{\"name\": \"admin2\", \"type\": \"USER\"}]}');",
-	//		"CREATE TABLE t32786 (id VARCHAR(36) PRIMARY KEY, parent_id VARCHAR(36), parent_path text);",
-	//		"INSERT INTO t32786 VALUES ('3AAA2577-DBC3-47E7-9E85-9CC7E19CF48A', null, null);",
-	//		"INSERT INTO t32786 VALUES ('5AE7EAFD-8277-4F41-83DE-0FD4B4482169', '3AAA2577-DBC3-47E7-9E85-9CC7E19CF48A', null);",
-	//		"CREATE TABLE users (id INT8 NOT NULL, name VARCHAR(50), PRIMARY KEY (id));",
-	//		"INSERT INTO users(id, name) VALUES (1, 'user1');",
-	//		"INSERT INTO users(id, name) VALUES (2, 'user2');",
-	//        "INSERT INTO users(id, name) VALUES (3, 'user3');",
-	//        "CREATE TABLE stuff(id INT8 NOT NULL, date DATE, user_id INT8, PRIMARY KEY (id), FOREIGN KEY (user_id) REFERENCES users (id));",
-	//        "INSERT INTO stuff(id, date, user_id) VALUES (1, '2007-10-15', 1);",
-	//        "INSERT INTO stuff(id, date, user_id) VALUES (2, '2007-12-15', 1);",
-	//        "INSERT INTO stuff(id, date, user_id) VALUES (3, '2007-11-15', 1);",
-	//        "INSERT INTO stuff(id, date, user_id) VALUES (4, '2008-01-15', 2);",
-	//        "INSERT INTO stuff(id, date, user_id) VALUES (5, '2007-06-15', 3);",
-	//        "INSERT INTO stuff(id, date, user_id) VALUES (6, '2007-03-15', 3);",
-	//	},
-	//	Assertions: []ScriptTestAssertion{
-	//		{
-	//			Skip: true,
-	//			Query: "SELECT users.id AS users_id, users.name AS users_name, stuff_1.id AS stuff_1_id, stuff_1.date AS stuff_1_date, stuff_1.user_id AS stuff_1_user_id FROM users LEFT JOIN stuff AS stuff_1 ON users.id = stuff_1.user_id AND stuff_1.id = (SELECT stuff_2.id FROM stuff AS stuff_2 WHERE stuff_2.user_id = users.id ORDER BY stuff_2.date DESC LIMIT 1) ORDER BY users.name;",
-	//			Expected: []sql.Row{
-	//				{1, "user1", 2, 2007-12-15, 1},
-	//				{2, "user2", 4, 2008-01-15, 2},
-	//				{3, "user3", 5, 2007-06-15, 3},
-	//			},
-	//		},
-	//	},
-	//},
+	{
+		// Skipping because we don't convert Time objects to strings in enginetests
+		Skip: true,
+		Name: "multiple nested subquery",
+		SetUpScript: []string{
+			"CREATE TABLE `groups`(id SERIAL PRIMARY KEY, data JSON);",
+			"INSERT INTO `groups`(data) VALUES('{\"name\": \"Group 1\", \"members\": [{\"name\": \"admin\", \"type\": \"USER\"}, {\"name\": \"user\", \"type\": \"USER\"}]}');",
+			"INSERT INTO `groups`(data) VALUES('{\"name\": \"Group 2\", \"members\": [{\"name\": \"admin2\", \"type\": \"USER\"}]}');",
+			"CREATE TABLE t32786 (id VARCHAR(36) PRIMARY KEY, parent_id VARCHAR(36), parent_path text);",
+			"INSERT INTO t32786 VALUES ('3AAA2577-DBC3-47E7-9E85-9CC7E19CF48A', null, null);",
+			"INSERT INTO t32786 VALUES ('5AE7EAFD-8277-4F41-83DE-0FD4B4482169', '3AAA2577-DBC3-47E7-9E85-9CC7E19CF48A', null);",
+			"CREATE TABLE users (id INT8 NOT NULL, name VARCHAR(50), PRIMARY KEY (id));",
+			"INSERT INTO users(id, name) VALUES (1, 'user1');",
+			"INSERT INTO users(id, name) VALUES (2, 'user2');",
+			"INSERT INTO users(id, name) VALUES (3, 'user3');",
+			"CREATE TABLE stuff(id INT8 NOT NULL, date DATE, user_id INT8, PRIMARY KEY (id), FOREIGN KEY (user_id) REFERENCES users (id));",
+			"INSERT INTO stuff(id, date, user_id) VALUES (1, '2007-10-15', 1);",
+			"INSERT INTO stuff(id, date, user_id) VALUES (2, '2007-12-15', 1);",
+			"INSERT INTO stuff(id, date, user_id) VALUES (3, '2007-11-15', 1);",
+			"INSERT INTO stuff(id, date, user_id) VALUES (4, '2008-01-15', 2);",
+			"INSERT INTO stuff(id, date, user_id) VALUES (5, '2007-06-15', 3);",
+			"INSERT INTO stuff(id, date, user_id) VALUES (6, '2007-03-15', 3);",
+		},
+		Assertions: []ScriptTestAssertion{
+			{
+				Skip:  true,
+				Query: "SELECT users.id AS users_id, users.name AS users_name, stuff_1.id AS stuff_1_id, stuff_1.date AS stuff_1_date, stuff_1.user_id AS stuff_1_user_id FROM users LEFT JOIN stuff AS stuff_1 ON users.id = stuff_1.user_id AND stuff_1.id = (SELECT stuff_2.id FROM stuff AS stuff_2 WHERE stuff_2.user_id = users.id ORDER BY stuff_2.date DESC LIMIT 1) ORDER BY users.name;",
+				Expected: []sql.Row{
+					{1, "user1", 2, "2007 - 12 - 15", 1},
+					{2, "user2", 4, "2008 - 01 - 15", 2},
+					{3, "user3", 5, "2007 - 06 - 15", 3},
+				},
+			},
+		},
+	},
 	{
 		Name: "multiple nested subquery again",
 		SetUpScript: []string{

diff --git a/enginetest/queries/queries.go b/enginetest/queries/queries.go
@@ -828,10 +828,6 @@ var QueryTests = []QueryTest{
 		Query:    "select y as x from xy group by (y) having AVG(x) > 0",
 		Expected: []sql.Row{{0}, {1}, {3}},
 	},
-	// {
-	//	Query:    "select y as z from xy group by (y) having AVG(z) > 0",
-	//	Expected: []sql.Row{{1}, {2}, {3}},
-	// },
 	{
 		Query:    "SELECT * FROM mytable t0 INNER JOIN mytable t1 ON (t1.i IN (((true)%(''))));",
 		Expected: []sql.Row{},
@@ -10381,6 +10377,28 @@ from typestable`,
 			{2, "second row"},
 		},
 	},
+	{
+		Query: "select * from two_pk group by pk1, pk2",
+		Expected: []sql.Row{
+			{0, 0, 0, 1, 2, 3, 4},
+			{0, 1, 10, 11, 12, 13, 14},
+			{1, 0, 20, 21, 22, 23, 24},
+			{1, 1, 30, 31, 32, 33, 34},
+		},
+	},
+	{
+		Query: "select pk1+1 from two_pk group by pk1 + 1, mod(pk2, 2)",
+		Expected: []sql.Row{
+			{1}, {1}, {2}, {2},
+		},
+	},
+	{
+		Query: "select mod(pk2, 2) from two_pk group by pk1 + 1, mod(pk2, 2)",
+		Expected: []sql.Row{
+			// mod is a Decimal type, which we convert to a string in our enginetests
+			{"0"}, {"1"}, {"0"}, {"1"},
+		},
+	},
 }
 
 var KeylessQueries = []QueryTest{
@@ -10632,6 +10650,20 @@ FROM mytable;`,
 			{"DECIMAL"},
 		},
 	},
+	// https://github.com/dolthub/dolt/issues/7095
+	// References in group by and having should be allowed to match select aliases
+	{
+		Query:    "select y as z from xy group by (y) having AVG(z) > 0",
+		Expected: []sql.Row{{1}, {2}, {3}},
+	},
+	{
+		Query:    "select y as z from xy group by (z) having AVG(z) > 0",
+		Expected: []sql.Row{{1}, {2}, {3}},
+	},
+	{
+		Query:    "select y + 1 as z from xy group by (z) having AVG(z) > 1",
+		Expected: []sql.Row{{2}, {3}, {4}},
+	},
 }
 
 var VersionedQueries = []QueryTest{
@@ -11399,6 +11431,15 @@ var ErrorQueries = []QueryErrorTest{
 		Query:       "SELECT 1 INTO mytable;",
 		ExpectedErr: sql.ErrUndeclaredVariable,
 	},
+	{
+		Query:       "select * from two_pk group by pk1",
+		ExpectedErr: analyzererrors.ErrValidationGroupBy,
+	},
+	{
+		// Grouping over functions and math expressions over PK does not count, and must appear in select
+		Query:       "select * from two_pk group by pk1 + 1, mod(pk2, 2)",
+		ExpectedErr: analyzererrors.ErrValidationGroupBy,
+	},
 }
 
 var BrokenErrorQueries = []QueryErrorTest{
@@ -11415,10 +11456,10 @@ var BrokenErrorQueries = []QueryErrorTest{
 		ExpectedErr: sql.ErrTableNotFound,
 	},
 
-	// Our behavior in when sql_mode = ONLY_FULL_GROUP_BY is inconsistent with MySQL
+	// Our behavior in when sql_mode = ONLY_FULL_GROUP_BY is inconsistent with MySQL. This is because we skip validation
+	// for GroupBys wrapped in a Project since we are not able to validate selected expressions that get optimized as an
+	// alias.
 	// Relevant issue: https://github.com/dolthub/dolt/issues/4998
-	// Special case: If you are grouping by every field of the PK, then you can select anything
-	// Otherwise, whatever you are selecting must be in the Group By (with the exception of aggregations)
 	{
 		Query:       "SELECT col0, floor(col1) FROM tab1 GROUP by col0;",
 		ExpectedErr: analyzererrors.ErrValidationGroupBy,
@@ -11427,39 +11468,11 @@ var BrokenErrorQueries = []QueryErrorTest{
 		Query:       "SELECT floor(cor0.col1) * ceil(cor0.col0) AS col2 FROM tab1 AS cor0 GROUP BY cor0.col0",
 		ExpectedErr: analyzererrors.ErrValidationGroupBy,
 	},
-	{
-		Query: "select * from two_pk group by pk1, pk2",
-		// No error
-	},
-	{
-		Query:       "select * from two_pk group by pk1",
-		ExpectedErr: analyzererrors.ErrValidationGroupBy,
-	},
-	{
-		// Grouping over functions and math expressions over PK does not count, and must appear in select
-		Query:       "select * from two_pk group by pk1 + 1, mod(pk2, 2)",
-		ExpectedErr: analyzererrors.ErrValidationGroupBy,
-	},
-	{
-		// Grouping over functions and math expressions over PK does not count, and must appear in select
-		Query: "select pk1+1 from two_pk group by pk1 + 1, mod(pk2, 2)",
-		// No error
-	},
-	{
-		// Grouping over functions and math expressions over PK does not count, and must appear in select
-		Query: "select mod(pk2, 2) from two_pk group by pk1 + 1, mod(pk2, 2)",
-		// No error
-	},
-	{
-		// Grouping over functions and math expressions over PK does not count, and must appear in select
-		Query: "select mod(pk2, 2) from two_pk group by pk1 + 1, mod(pk2, 2)",
-		// No error
-	},
 	{
 		Query: `SELECT any_value(pk), (SELECT max(pk) FROM one_pk WHERE pk < opk.pk) AS x
 						FROM one_pk opk WHERE (SELECT max(pk) FROM one_pk WHERE pk < opk.pk) > 0
 						GROUP BY (SELECT max(pk) FROM one_pk WHERE pk < opk.pk) ORDER BY x`,
-		// No error, but we get opk.pk does not exist
+		// No error, but we get opk.pk does not exist (aliasing error)
 	},
 	// Unimplemented JSON functions
 	{

diff --git a/sql/analyzer/validation_rules.go b/sql/analyzer/validation_rules.go
@@ -262,8 +262,9 @@ func validateGroupBy(ctx *sql.Context, a *Analyzer, n sql.Node, scope *plan.Scop
 
 		switch parent.(type) {
 		case *plan.Having, *plan.Project, *plan.Sort:
-			// TODO: these shouldn't be skipped; you can group by primary key without problem b/c only one value
-			// https://dev.mysql.com/doc/refman/8.0/en/group-by-handling.html#:~:text=The%20query%20is%20valid%20if%20name%20is%20a%20primary%20key
+			// TODO: these shouldn't be skipped but we currently aren't able to validate GroupBys with selected aliased
+			// expressions and a lot of our tests group by aliases
+			// https://github.com/dolthub/dolt/issues/4998
 			return true
 		}
 
@@ -273,17 +274,34 @@ func validateGroupBy(ctx *sql.Context, a *Analyzer, n sql.Node, scope *plan.Scop
 			return true
 		}
 
-		var groupBys []string
+		primaryKeys := make(map[string]bool)
+		for _, col := range gb.Child.Schema() {
+			if col.PrimaryKey {
+				primaryKeys[strings.ToLower(col.String())] = true
+			}
+		}
+
+		groupBys := make(map[string]bool)
+		groupByPrimaryKeys := 0
 		for _, expr := range gb.GroupByExprs {
-			groupBys = append(groupBys, expr.String())
+			exprStr := strings.ToLower(expr.String())
+			groupBys[exprStr] = true
+			if primaryKeys[exprStr] {
+				groupByPrimaryKeys++
+			}
+		}
+
+		// TODO: also allow grouping by unique non-nullable columns
+		if len(primaryKeys) != 0 && groupByPrimaryKeys == len(primaryKeys) {
+			return true
 		}
 
 		for _, expr := range gb.SelectedExprs {
-			if _, ok := expr.(sql.Aggregation); !ok {
-				if !expressionReferencesOnlyGroupBys(groupBys, expr) {
-					err = analyzererrors.ErrValidationGroupBy.New(expr.String())
-					return false
-				}
+			if !expressionReferencesOnlyGroupBys(groupBys, expr) {
+				// TODO: this is currently too restrictive. Dependent columns are fine to reference
+				// https://dev.mysql.com/doc/refman/8.4/en/group-by-functional-dependence.html
+				err = analyzererrors.ErrValidationGroupBy.New(expr.String())
+				return false
 			}
 		}
 		return true
@@ -292,22 +310,14 @@ func validateGroupBy(ctx *sql.Context, a *Analyzer, n sql.Node, scope *plan.Scop
 	return n, transform.SameTree, err
 }
 
-func expressionReferencesOnlyGroupBys(groupBys []string, expr sql.Expression) bool {
+func expressionReferencesOnlyGroupBys(groupBys map[string]bool, expr sql.Expression) bool {
 	valid := true
 	sql.Inspect(expr, func(expr sql.Expression) bool {
 		switch expr := expr.(type) {
 		case nil, sql.Aggregation, *expression.Literal:
 			return false
-		case *expression.Alias, sql.FunctionExpression:
-			if stringContains(groupBys, expr.String()) {
-				return false
-			}
-			return true
-		// cc: https://dev.mysql.com/doc/refman/8.0/en/group-by-handling.html
-		// Each part of the SelectExpr must refer to the aggregated columns in some way
-		// TODO: this isn't complete, it's overly restrictive. Dependant columns are fine to reference.
 		default:
-			if stringContains(groupBys, expr.String()) {
+			if groupBys[strings.ToLower(expr.String())] {
 				return false
 			}
 

diff --git a/sql/column.go b/sql/column.go
@@ -130,6 +130,10 @@ func (c Column) Copy() *Column {
 	return &c
 }
 
+func (c *Column) String() string {
+	return c.Source + "." + c.Name
+}
+
 // TableId is the unique identifier of a table or table alias in a multi-db environment.
 // The long-term goal is to migrate all uses of table name strings to this and minimize places where we
 // construct/inspect TableIDs. By treating this as an opaque identifier, it will be easier to migrate to