From 4827bd17af4f609c11791a3589d6d0498a6a3e75 Mon Sep 17 00:00:00 2001 From: James Cor Date: Wed, 18 Jun 2025 13:49:51 -0700 Subject: [PATCH 1/8] don't use schema for hashlookups --- sql/plan/hash_lookup.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/plan/hash_lookup.go b/sql/plan/hash_lookup.go index f65bdecad2..27b6ec434d 100644 --- a/sql/plan/hash_lookup.go +++ b/sql/plan/hash_lookup.go @@ -127,7 +127,7 @@ func (n *HashLookup) GetHashKey(ctx *sql.Context, e sql.Expression, row sql.Row) return nil, err } if s, ok := key.([]interface{}); ok { - return hash.HashOf(ctx, n.Schema(), s) + return hash.HashOf(ctx, nil, s) } // byte slices are not hashable if k, ok := key.([]byte); ok { From 250b8e0752ae32ca3c230c7d8aeae0200bfbda4e Mon Sep 17 00:00:00 2001 From: James Cor Date: Wed, 18 Jun 2025 13:59:12 -0700 Subject: [PATCH 2/8] use correct schema for hashlookup keys --- sql/plan/hash_lookup.go | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sql/plan/hash_lookup.go b/sql/plan/hash_lookup.go index 27b6ec434d..de6674c984 100644 --- a/sql/plan/hash_lookup.go +++ b/sql/plan/hash_lookup.go @@ -16,6 +16,7 @@ package plan import ( "fmt" + "github.com/dolthub/go-mysql-server/sql/expression" "sync" "github.com/dolthub/go-mysql-server/sql" @@ -127,7 +128,13 @@ func (n *HashLookup) GetHashKey(ctx *sql.Context, e sql.Expression, row sql.Row) return nil, err } if s, ok := key.([]interface{}); ok { - return hash.HashOf(ctx, nil, s) + var sch sql.Schema + if tup, isTup := e.(*expression.Tuple); isTup { + for _, expr := range tup.Children() { + sch = append(sch, &sql.Column{Type: expr.Type()}) + } + } + return hash.HashOf(ctx, sch, s) } // byte slices are not hashable if k, ok := key.([]byte); ok { From ee61b872b04016ca55cac2b952dda1dfe4045c2b Mon Sep 17 00:00:00 2001 From: jycor Date: Wed, 18 Jun 2025 21:00:35 +0000 Subject: [PATCH 3/8] [ga-format-pr] Run ./format_repo.sh to fix formatting --- sql/plan/hash_lookup.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/plan/hash_lookup.go b/sql/plan/hash_lookup.go index de6674c984..77e8d07601 100644 --- a/sql/plan/hash_lookup.go +++ b/sql/plan/hash_lookup.go @@ -16,9 +16,10 @@ package plan import ( "fmt" - "github.com/dolthub/go-mysql-server/sql/expression" "sync" + "github.com/dolthub/go-mysql-server/sql/expression" + "github.com/dolthub/go-mysql-server/sql" "github.com/dolthub/go-mysql-server/sql/hash" "github.com/dolthub/go-mysql-server/sql/types" From c8940d78d21c4a34625af1e3ea73ee5da3ece847 Mon Sep 17 00:00:00 2001 From: James Cor Date: Wed, 18 Jun 2025 14:20:51 -0700 Subject: [PATCH 4/8] add a test --- enginetest/queries/join_queries.go | 19 +++++++++++++++++++ sql/plan/hash_lookup.go | 3 +-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/enginetest/queries/join_queries.go b/enginetest/queries/join_queries.go index ab7aef1901..0045e642c6 100644 --- a/enginetest/queries/join_queries.go +++ b/enginetest/queries/join_queries.go @@ -1161,6 +1161,25 @@ var JoinScriptTests = []ScriptTest{ }, }, }, + { + // Since hash.HashOf takes in a sql.Schema to convert and hash keys, + // we need to pass in the right keys. + Name: "HashLookups regression test", + SetUpScript: []string{ + "create table t1 (i int primary key, j varchar(1), k int);", + "create table t2 (i int primary key, k int);", + "insert into t1 values (111111, 'a', 111111);", + "insert into t2 values (111111, 111111);", + }, + Assertions: []ScriptTestAssertion{ + { + Query: "select /*+ HASH_JOIN(t1, t2) */ * from t1 join t2 on t1.i = t2.i and t1.k = t2.k;", + Expected: []sql.Row{ + {111111, "a", 111111, 111111, 111111}, + }, + }, + }, + }, } var LateralJoinScriptTests = []ScriptTest{ diff --git a/sql/plan/hash_lookup.go b/sql/plan/hash_lookup.go index 77e8d07601..e8c7a4b8d7 100644 --- a/sql/plan/hash_lookup.go +++ b/sql/plan/hash_lookup.go @@ -18,9 +18,8 @@ import ( "fmt" "sync" - "github.com/dolthub/go-mysql-server/sql/expression" - "github.com/dolthub/go-mysql-server/sql" + "github.com/dolthub/go-mysql-server/sql/expression" "github.com/dolthub/go-mysql-server/sql/hash" "github.com/dolthub/go-mysql-server/sql/types" ) From 0eb0331942628e7f37de8fcf0c8e99a7eac50df6 Mon Sep 17 00:00:00 2001 From: James Cor Date: Fri, 20 Jun 2025 15:01:56 -0700 Subject: [PATCH 5/8] feedback --- enginetest/queries/join_queries.go | 30 ++++++++++++++++++++++-------- sql/hash/hash.go | 11 +++++++++++ sql/plan/hash_lookup.go | 16 +++++++--------- 3 files changed, 40 insertions(+), 17 deletions(-) diff --git a/enginetest/queries/join_queries.go b/enginetest/queries/join_queries.go index 0045e642c6..4eb0dd21fd 100644 --- a/enginetest/queries/join_queries.go +++ b/enginetest/queries/join_queries.go @@ -1162,20 +1162,34 @@ var JoinScriptTests = []ScriptTest{ }, }, { - // Since hash.HashOf takes in a sql.Schema to convert and hash keys, - // we need to pass in the right keys. - Name: "HashLookups regression test", + // After this change: https://github.com/dolthub/go-mysql-server/pull/3038 + // hash.HashOf takes in a sql.Schema to convert and hash keys, so + // we need to pass in the schema of the join key. + // This tests a bug introduced in that same PR where we incorrectly pass in the entire schema, + // resulting in incorrect conversions. + Name: "HashLookup on multiple columns with tables with different schemas", SetUpScript: []string{ - "create table t1 (i int primary key, j varchar(1), k int);", - "create table t2 (i int primary key, k int);", - "insert into t1 values (111111, 'a', 111111);", - "insert into t2 values (111111, 111111);", + "create table t1 (i int primary key, k int);", + "create table t2 (i int primary key, j varchar(1), k int);", + "insert into t1 values (111111, 111111);", + "insert into t2 values (111111, 'a', 111111);", + + "create table tt2 (i int primary key, j varchar(128) collate utf8mb4_0900_ai_ci);", + "create table tt1 (i int primary key, j varchar(128) collate utf8mb4_0900_ai_ci);", + "insert into tt1 values (1, 'ABCDE');", + "insert into tt2 values (1, 'abcde');", }, Assertions: []ScriptTestAssertion{ { Query: "select /*+ HASH_JOIN(t1, t2) */ * from t1 join t2 on t1.i = t2.i and t1.k = t2.k;", Expected: []sql.Row{ - {111111, "a", 111111, 111111, 111111}, + {111111, 111111, 111111, "a", 111111}, + }, + }, + { + Query: "select /*+ HASH_JOIN(tt1, tt2) */ * from tt1 join tt2 on tt1.i = tt2.i and tt1.j = tt2.j;", + Expected: []sql.Row{ + {1, "ABCDE", 1, "abcde"}, }, }, }, diff --git a/sql/hash/hash.go b/sql/hash/hash.go index e37827f7e5..873b0b0bbe 100644 --- a/sql/hash/hash.go +++ b/sql/hash/hash.go @@ -30,6 +30,17 @@ var digestPool = sync.Pool{ }, } +// ExprsToSchema converts a list of sql.Expression to a sql.Schema. +// This is used for functions that use HashOf, but don't already have a schema. +// The generated schema ONLY contains the types of the expressions without any column names or any other info. +func ExprsToSchema(exprs... sql.Expression) sql.Schema { + var sch sql.Schema + for _, expr := range exprs { + sch = append(sch, &sql.Column{Type: expr.Type()}) + } + return sch +} + // HashOf returns a hash of the given value to be used as key in a cache. func HashOf(ctx *sql.Context, sch sql.Schema, row sql.Row) (uint64, error) { hash := digestPool.Get().(*xxhash.Digest) diff --git a/sql/plan/hash_lookup.go b/sql/plan/hash_lookup.go index e8c7a4b8d7..7fc1da10a4 100644 --- a/sql/plan/hash_lookup.go +++ b/sql/plan/hash_lookup.go @@ -19,8 +19,7 @@ import ( "sync" "github.com/dolthub/go-mysql-server/sql" - "github.com/dolthub/go-mysql-server/sql/expression" - "github.com/dolthub/go-mysql-server/sql/hash" + "github.com/dolthub/go-mysql-server/sql/hash" "github.com/dolthub/go-mysql-server/sql/types" ) @@ -34,12 +33,14 @@ import ( // on the projected results. If cached results are not available, it // simply delegates to the child. func NewHashLookup(n sql.Node, rightEntryKey sql.Expression, leftProbeKey sql.Expression, joinType JoinType) *HashLookup { + leftKeySch := hash.ExprsToSchema(leftProbeKey) return &HashLookup{ UnaryNode: UnaryNode{n}, RightEntryKey: rightEntryKey, LeftProbeKey: leftProbeKey, Mutex: new(sync.Mutex), JoinType: joinType, + leftKeySch: leftKeySch, } } @@ -50,6 +51,7 @@ type HashLookup struct { Mutex *sync.Mutex Lookup *map[interface{}][]sql.Row JoinType JoinType + leftKeySch sql.Schema } var _ sql.Node = (*HashLookup)(nil) @@ -71,6 +73,7 @@ func (n *HashLookup) WithExpressions(exprs ...sql.Expression) (sql.Node, error) ret := *n ret.RightEntryKey = exprs[0] ret.LeftProbeKey = exprs[1] + ret.leftKeySch = hash.ExprsToSchema(ret.LeftProbeKey) return &ret, nil } @@ -128,13 +131,7 @@ func (n *HashLookup) GetHashKey(ctx *sql.Context, e sql.Expression, row sql.Row) return nil, err } if s, ok := key.([]interface{}); ok { - var sch sql.Schema - if tup, isTup := e.(*expression.Tuple); isTup { - for _, expr := range tup.Children() { - sch = append(sch, &sql.Column{Type: expr.Type()}) - } - } - return hash.HashOf(ctx, sch, s) + return hash.HashOf(ctx, n.leftKeySch, s) } // byte slices are not hashable if k, ok := key.([]byte); ok { @@ -146,3 +143,4 @@ func (n *HashLookup) GetHashKey(ctx *sql.Context, e sql.Expression, row sql.Row) func (n *HashLookup) Dispose() { n.Lookup = nil } + From f3ed6e26c6886abda650b2ec09de614ba0e7060c Mon Sep 17 00:00:00 2001 From: James Cor Date: Fri, 20 Jun 2025 15:02:45 -0700 Subject: [PATCH 6/8] formatting --- sql/plan/hash_lookup.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/plan/hash_lookup.go b/sql/plan/hash_lookup.go index 7fc1da10a4..101672deb0 100644 --- a/sql/plan/hash_lookup.go +++ b/sql/plan/hash_lookup.go @@ -19,7 +19,7 @@ import ( "sync" "github.com/dolthub/go-mysql-server/sql" - "github.com/dolthub/go-mysql-server/sql/hash" + "github.com/dolthub/go-mysql-server/sql/hash" "github.com/dolthub/go-mysql-server/sql/types" ) From 76c0b7a20539ef64f7a5a7b12f2124a43d624e83 Mon Sep 17 00:00:00 2001 From: jycor Date: Fri, 20 Jun 2025 22:04:12 +0000 Subject: [PATCH 7/8] [ga-format-pr] Run ./format_repo.sh to fix formatting --- sql/hash/hash.go | 2 +- sql/plan/hash_lookup.go | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/sql/hash/hash.go b/sql/hash/hash.go index 873b0b0bbe..94bcc64206 100644 --- a/sql/hash/hash.go +++ b/sql/hash/hash.go @@ -33,7 +33,7 @@ var digestPool = sync.Pool{ // ExprsToSchema converts a list of sql.Expression to a sql.Schema. // This is used for functions that use HashOf, but don't already have a schema. // The generated schema ONLY contains the types of the expressions without any column names or any other info. -func ExprsToSchema(exprs... sql.Expression) sql.Schema { +func ExprsToSchema(exprs ...sql.Expression) sql.Schema { var sch sql.Schema for _, expr := range exprs { sch = append(sch, &sql.Column{Type: expr.Type()}) diff --git a/sql/plan/hash_lookup.go b/sql/plan/hash_lookup.go index 101672deb0..7926d29255 100644 --- a/sql/plan/hash_lookup.go +++ b/sql/plan/hash_lookup.go @@ -143,4 +143,3 @@ func (n *HashLookup) GetHashKey(ctx *sql.Context, e sql.Expression, row sql.Row) func (n *HashLookup) Dispose() { n.Lookup = nil } - From ab428c7470eba0fb24832d077a7b4e232aa59cdc Mon Sep 17 00:00:00 2001 From: James Cor Date: Fri, 20 Jun 2025 15:54:08 -0700 Subject: [PATCH 8/8] split tests --- enginetest/queries/join_queries.go | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/enginetest/queries/join_queries.go b/enginetest/queries/join_queries.go index 4eb0dd21fd..2cd269c03a 100644 --- a/enginetest/queries/join_queries.go +++ b/enginetest/queries/join_queries.go @@ -1173,11 +1173,6 @@ var JoinScriptTests = []ScriptTest{ "create table t2 (i int primary key, j varchar(1), k int);", "insert into t1 values (111111, 111111);", "insert into t2 values (111111, 'a', 111111);", - - "create table tt2 (i int primary key, j varchar(128) collate utf8mb4_0900_ai_ci);", - "create table tt1 (i int primary key, j varchar(128) collate utf8mb4_0900_ai_ci);", - "insert into tt1 values (1, 'ABCDE');", - "insert into tt2 values (1, 'abcde');", }, Assertions: []ScriptTestAssertion{ { @@ -1186,8 +1181,19 @@ var JoinScriptTests = []ScriptTest{ {111111, 111111, 111111, "a", 111111}, }, }, + }, + }, + { + Name: "HashLookup on multiple columns with collations", + SetUpScript: []string{ + "create table t1 (i int primary key, j varchar(128) collate utf8mb4_0900_ai_ci);", + "create table t2 (i int primary key, j varchar(128) collate utf8mb4_0900_ai_ci);", + "insert into t1 values (1, 'ABCDE');", + "insert into t2 values (1, 'abcde');", + }, + Assertions: []ScriptTestAssertion{ { - Query: "select /*+ HASH_JOIN(tt1, tt2) */ * from tt1 join tt2 on tt1.i = tt2.i and tt1.j = tt2.j;", + Query: "select /*+ HASH_JOIN(t1, t2) */ * from t1 join t2 on t1.i = t2.i and t1.j = t2.j;", Expected: []sql.Row{ {1, "ABCDE", 1, "abcde"}, },