Merged
Changes from all commits
Commits
72 commits
1732eb7
Update `TabletBalancer.Pick` signature to accept options
mhamza15 Aug 16, 2025
f37cdc8
Initial implementation of the hash ring
mhamza15 Aug 16, 2025
114fece
Set up initial health check subscription
mhamza15 Aug 16, 2025
4742e8d
Rename to `SessionBalancer` and create initial tests
mhamza15 Aug 16, 2025
5be7ac7
Improvements and add more tests
mhamza15 Aug 17, 2025
80eb92f
Clarify comment
mhamza15 Aug 17, 2025
c421c3d
Remove brackets
mhamza15 Aug 17, 2025
34ca9e9
Clarify comment
mhamza15 Aug 17, 2025
92abecb
Add session hash to session proto
mhamza15 Aug 17, 2025
41141d7
Clarify comment
mhamza15 Aug 17, 2025
70ed65f
Update `SessionHash` to be optional
mhamza15 Aug 18, 2025
0698c2b
Pass in invalid tablets to tablet balancer
mhamza15 Aug 23, 2025
7159adf
Build initial hash rings
mhamza15 Aug 24, 2025
b77e758
Set up health check subscription first to avoid missing changes
mhamza15 Aug 24, 2025
b38c28f
Hash uuid on pick
mhamza15 Aug 25, 2025
df48aa1
Add invalid tablets to `PickOpts`
mhamza15 Aug 25, 2025
a18bab7
Make session uuid not pointer
mhamza15 Aug 25, 2025
f9956e2
Remove unused import
mhamza15 Aug 26, 2025
d60c593
Remove old tablets when a tablet's target changes
mhamza15 Aug 26, 2025
5833a54
Fetch initial tablet state after subscribing to health check
mhamza15 Aug 26, 2025
5bf666d
Revert "Remove old tablets when a tablet's target changes"
mhamza15 Aug 26, 2025
d8723b3
Remove primary tablets from rings
mhamza15 Aug 26, 2025
479cde4
undo formatting
mhamza15 Aug 26, 2025
475408c
Pass session uuid to `withRetry`
mhamza15 Aug 28, 2025
98a7c8b
Fix new types
mhamza15 Aug 28, 2025
f59a14f
Fix tests
mhamza15 Aug 28, 2025
2620b76
Pass `WrapOpts` by value
mhamza15 Aug 28, 2025
e62c88f
Pass `PickOpts` by value
mhamza15 Aug 28, 2025
0db53cd
Change `ExecuteOptions` in `WrapOpts` as a pointer
mhamza15 Aug 28, 2025
cde392c
Fix `balancer-type` help text
mhamza15 Aug 29, 2025
eadf1ae
Fix some bugs
mhamza15 Aug 30, 2025
8fae28c
Get cell from tablet alias rather than target
mhamza15 Aug 30, 2025
cd30d9d
Initial e2e test
mhamza15 Aug 30, 2025
37b7e62
Add more e2e tests
mhamza15 Aug 31, 2025
0a8ce4b
undo auto fmt
mhamza15 Oct 3, 2025
bdc2ba5
remove tablet from old ring if its target changes
mhamza15 Oct 5, 2025
fa18203
Switch to rendezvous hashing
mhamza15 Oct 7, 2025
b4ecebc
Remove health check
mhamza15 Oct 7, 2025
8dc63cc
Merge remote-tracking branch 'upstream/main' into session-balancer
mhamza15 Dec 9, 2025
01b5bb5
Update balancer.go
mhamza15 Dec 9, 2025
b758326
more conflict changes
mhamza15 Dec 9, 2025
b30017c
remove some allocations
mhamza15 Dec 9, 2025
89455ec
adjust tests to match previous tests
mhamza15 Dec 10, 2025
27a48d8
add benchmarks comparing all balancers
mhamza15 Dec 10, 2025
8236141
add session mode to help text
mhamza15 Dec 10, 2025
f45a400
remove benchmark
mhamza15 Dec 10, 2025
3f4c17e
remove PickOpts.InvalidTablets
mhamza15 Dec 10, 2025
6abc3f7
add promptless changelog suggestion
mhamza15 Dec 11, 2025
1b9b7d9
temp: add session to queryservice
mhamza15 Dec 11, 2025
5f497bf
Revert "temp: add session to queryservice"
mhamza15 Dec 11, 2025
dc61e95
remove random changelog file
mhamza15 Dec 11, 2025
ee8f2b6
Update `QueryService` to include session
mhamza15 Dec 12, 2025
859babc
Merge remote-tracking branch 'upstream/main' into session-balancer
mhamza15 Dec 12, 2025
0060112
fix e2e test
mhamza15 Dec 12, 2025
c00ef7b
fix import cycle
mhamza15 Dec 12, 2025
2ade6a9
try to fix actions
mhamza15 Dec 12, 2025
6531827
fix scatter conn test
mhamza15 Dec 12, 2025
74dbe47
try to fix actions
mhamza15 Dec 12, 2025
61439c5
remove debug message
mhamza15 Dec 12, 2025
b3fa8a6
Merge remote-tracking branch 'upstream/main' into session-balancer
mhamza15 Dec 15, 2025
62d289d
undo ci changes for flakiness
mhamza15 Dec 15, 2025
091aba7
add nil session checks
mhamza15 Dec 15, 2025
6f706d6
remove nil safe sessions
mhamza15 Dec 15, 2025
53664df
simplify pick opts
mhamza15 Dec 16, 2025
de491d1
simplify empty sessions
mhamza15 Dec 16, 2025
19e11c1
goimports
mhamza15 Dec 16, 2025
a4eb6e1
Update go/vt/vtgate/balancer/balancer.go
mhamza15 Dec 16, 2025
f63dcd6
switch pick opts to functional options
mhamza15 Dec 16, 2025
a3b213a
Update options.go
mhamza15 Dec 17, 2025
ef62ea0
Merge branch 'main' into session-balancer
mhamza15 Dec 17, 2025
5d6d428
Test connection stickiness
mhamza15 Dec 17, 2025
8fccc83
Merge branch 'main' into session-balancer
mhamza15 Dec 18, 2025
11 changes: 11 additions & 0 deletions changelog/24.0/24.0.0/summary.md
@@ -9,6 +9,7 @@
- **[Minor Changes](#minor-changes)**
- **[VTGate](#minor-changes-vtgate)**
- [New default for `--legacy-replication-lag-algorithm` flag](#vtgate-new-default-legacy-replication-lag-algorithm)
- [New "session" mode for `--vtgate-balancer-mode` flag](#vtgate-session-balancer-mode)
- **[Query Serving](#minor-changes-query-serving)**
- [JSON_EXTRACT now supports dynamic path arguments](#query-serving-json-extract-dynamic-args)
- **[VTTablet](#minor-changes-vttablet)**
@@ -42,6 +43,16 @@ Instead, a simpler algorithm purely based on low lag, high lag and minimum numbe

In v25 this flag will become deprecated and in the following release it will be removed. In the meantime, the legacy behaviour can be used by setting `--legacy-replication-lag-algorithm=true`. This deprecation is tracked in https://github.com/vitessio/vitess/issues/18914.

#### <a id="vtgate-session-balancer-mode"/>New "session" mode for `--vtgate-balancer-mode` flag</a>

The VTGate flag `--vtgate-balancer-mode` now supports a new "session" mode in addition to the existing "cell", "prefer-cell", and "random" modes. In session mode, each session is routed consistently to the same tablet for the session's duration, for as long as that tablet remains in the serving set.

To enable session mode, set the flag when starting VTGate:

```
--vtgate-balancer-mode=session
```

### <a id="minor-changes-query-serving"/>Query Serving</a>

#### <a id="query-serving-json-extract-dynamic-args"/>JSON_EXTRACT now supports dynamic path arguments</a>
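For intuition, the commit history shows the session balancer settled on rendezvous hashing (fa18203 "Switch to rendezvous hashing") keyed by a per-session hash (92abecb "Add session hash to session proto"). The sketch below is a minimal, self-contained illustration of that technique in Go; the names (`pickTablet`, `sessionHash`) are illustrative and are not the PR's actual API.

```go
package example

import (
	"fmt"
	"hash/fnv"
)

// pickTablet scores every candidate tablet against the session's hash and
// returns the highest-scoring one (rendezvous / highest-random-weight
// hashing). The same session hash keeps mapping to the same tablet while
// that tablet stays in the candidate set; when it disappears, only the
// sessions pinned to it are remapped.
func pickTablet(sessionHash uint64, tablets []string) string {
	var best string
	var bestScore uint64
	for _, tablet := range tablets {
		h := fnv.New64a()
		fmt.Fprintf(h, "%d|%s", sessionHash, tablet)
		if score := h.Sum64(); best == "" || score > bestScore {
			best, bestScore = tablet, score
		}
	}
	return best
}
```

Because each (session, tablet) pair is scored independently, adding or removing a tablet does not reshuffle unrelated sessions, which matches the stickiness and failover behavior exercised by the e2e tests further down.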
2 changes: 1 addition & 1 deletion go/flags/endtoend/vtgate.txt
@@ -246,7 +246,7 @@ Flags:
-v, --version print binary version
--vmodule vModuleFlag comma-separated list of pattern=N settings for file-filtered logging
--vschema-ddl-authorized-users string List of users authorized to execute vschema ddl operations, or '%' to allow all users.
--vtgate-balancer-mode string Tablet balancer mode (options: cell, prefer-cell, random). Defaults to 'cell' which shuffles tablets in the local cell.
--vtgate-balancer-mode string Tablet balancer mode (options: cell, prefer-cell, random, session). Defaults to 'cell' which shuffles tablets in the local cell.
--vtgate-config-terse-errors prevent bind vars from escaping in returned errors
--warming-reads-concurrency int Number of concurrent warming reads allowed (default 500)
--warming-reads-percent int Percentage of reads on the primary to forward to replicas. Useful for keeping buffer pools warm
15 changes: 9 additions & 6 deletions go/test/endtoend/cluster/cluster_process.go
@@ -45,12 +45,14 @@ import (
"vitess.io/vitess/go/vt/grpcclient"
"vitess.io/vitess/go/vt/log"
"vitess.io/vitess/go/vt/vterrors"
"vitess.io/vitess/go/vt/vtgate/executorcontext"
"vitess.io/vitess/go/vt/vtgate/planbuilder/plancontext"
"vitess.io/vitess/go/vt/vtgate/vtgateconn"
"vitess.io/vitess/go/vt/vttablet/tabletconn"

querypb "vitess.io/vitess/go/vt/proto/query"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
vtgatepb "vitess.io/vitess/go/vt/proto/vtgate"
vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc"

// Ensure dialers are registered (needed by ExecOnTablet and ExecOnVTGate).
@@ -807,7 +809,7 @@ func NewBareCluster(cell string, hostname string) *LocalProcessCluster {
// path/to/whatever exists
cluster.ReusingVTDATAROOT = true
} else {
err = createDirectory(cluster.CurrentVTDATAROOT, 0700)
err = createDirectory(cluster.CurrentVTDATAROOT, 0o700)
if err != nil {
log.Fatal(err)
}
@@ -948,11 +950,12 @@ func (cluster *LocalProcessCluster) ExecOnTablet(ctx context.Context, vttablet *

txID, reservedID := 0, 0

return conn.Execute(ctx, &querypb.Target{
session := executorcontext.NewSafeSession(&vtgatepb.Session{Options: opts})
return conn.Execute(ctx, session, &querypb.Target{
Keyspace: tablet.Keyspace,
Shard: tablet.Shard,
TabletType: tablet.Type,
}, sql, bindvars, int64(txID), int64(reservedID), opts)
}, sql, bindvars, int64(txID), int64(reservedID))
}

// ExecOnVTGate executes a query on a local cluster VTGate with the provided
@@ -1165,7 +1168,8 @@ func (cluster *LocalProcessCluster) waitForMySQLProcessToExit(mysqlctlProcessLis

// StartVtbackup starts a vtbackup
func (cluster *LocalProcessCluster) StartVtbackup(newInitDBFile string, initialBackup bool,
keyspace string, shard string, cell string, extraArgs ...string) error {
keyspace string, shard string, cell string, extraArgs ...string,
) error {
log.Info("Starting vtbackup")
cluster.VtbackupProcess = *VtbackupProcessInstance(
cluster.GetAndReserveTabletUID(),
@@ -1195,7 +1199,6 @@ func (cluster *LocalProcessCluster) GetAndReservePort() int {
cluster.nextPortForProcess = cluster.nextPortForProcess + 1
log.Infof("Attempting to reserve port: %v", cluster.nextPortForProcess)
ln, err := net.Listen("tcp", net.JoinHostPort("127.0.0.1", strconv.Itoa(cluster.nextPortForProcess)))

if err != nil {
log.Errorf("Can't listen on port %v: %s, trying next port", cluster.nextPortForProcess, err)
continue
@@ -1218,7 +1221,7 @@ const portFileTimeout = 1 * time.Hour
// If yes, then return that port, and save port + 200 in the same file
// here, assumptions is 200 ports might be consumed for all tests in a package
func getPort() int {
portFile, err := os.OpenFile(path.Join(os.TempDir(), "endtoend.port"), os.O_CREATE|os.O_RDWR, 0644)
portFile, err := os.OpenFile(path.Join(os.TempDir(), "endtoend.port"), os.O_CREATE|os.O_RDWR, 0o644)
if err != nil {
panic(err)
}
223 changes: 223 additions & 0 deletions go/test/endtoend/vtgate/tabletbalancer/session_test.go
@@ -0,0 +1,223 @@
/*
Copyright 2025 The Vitess Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package tabletbalancer

import (
"context"
"fmt"
"testing"
"time"

"github.com/stretchr/testify/require"

"vitess.io/vitess/go/mysql"
"vitess.io/vitess/go/test/endtoend/cluster"
"vitess.io/vitess/go/vt/vtgate/planbuilder/plancontext"
)

// TestSessionModeBalancer tests that the "session" mode routes each session consistently to the same tablet.
func TestSessionModeBalancer(t *testing.T) {
vtgateProcess, vtParams, _, _ := setupCluster(t)
defer vtgateProcess.TearDown()

// Create 2 session connections that route to different tablets
conns := createSessionConnections(t, &vtParams, 2)
for conn := range conns {
defer conn.Close()
}

verifyStickiness(t, conns, 20)
}

// TestSessionModeRemoveTablet tests that when a tablet is killed, connections switch to remaining tablets
func TestSessionModeRemoveTablet(t *testing.T) {
vtgateProcess, vtParams, replicaTablets, aliases := setupCluster(t)
defer vtgateProcess.TearDown()

// Create 2 connections to different tablets
conns := createSessionConnections(t, &vtParams, 2)
for conn := range conns {
defer conn.Close()
}

// Find the first replica tablet that one of our connections is using
var tabletToKill *cluster.Vttablet
var affectedConn *mysql.Conn
var killedServerID int64

for _, tablet := range replicaTablets {
tabletServerID := aliases[tablet.Alias]

// Check if any connection is using this tablet
for conn, connServerID := range conns {
if connServerID != tabletServerID {
continue
}

// We found a connection using this tablet; this is the tablet we'll kill
tabletToKill = tablet
affectedConn = conn
killedServerID = tabletServerID
break
}

// We found a tablet, no need to check other tablets
if tabletToKill != nil {
break
}
}

require.NotNil(t, tabletToKill, "Should find a tablet to kill")

// Kill the tablet immediately
err := tabletToKill.VttabletProcess.Kill()
require.Error(t, err)

// Wait for the connection to switch to a new tablet and update the map
require.Eventually(t, func() bool {
newServerID := getServerID(t, affectedConn)
if newServerID != killedServerID {
conns[affectedConn] = newServerID
return true
}

return false
}, 10*time.Millisecond, 1*time.Millisecond, "Connection should switch to a different tablet")
Comment on lines +98 to +99

Member:
This doesn't break the serializable guarantees of the new mode?

Collaborator Author:
It does. This test case mimics the case where a tablet goes down or stops serving, ensuring that the connections move over accordingly. The alternative is to error until the tablet is back, but that is likely less than ideal.

Member:
I don't think that we should return an error forever, we should only do so once. The client should know that their session — which provided serializable consistency — has been terminated and that they need to reconnect in order to get a new view with the same serializable property. We can't silently move their session to another replica that could be minutes behind the previous one IMO. Doesn't that defeat the purpose here, since you can never really count on the core property holding true and have to maintain checks and handling for going back in time?

Member:
If we can ensure that the newly selected replica is at least at the same GTID snapshot/position that the old one was, then it would be fine (it's OK to go forward in time).

Or do you think I'm missing or misunderstanding something here?

Collaborator Author:
Nope, you're not missing anything; those are all fair points. I suppose it comes down to how much of a guarantee we'd like to make to clients. In my head at least, I was envisioning this as a large step up from the normal selection logic of a random tablet in the local cell, but not quite a 100% guarantee of consistency across the board. Even if we do stick with the approach I've already taken, we'd definitely need to clarify in the docs that when tablets are added or removed, there is a possibility of breaking the consistency guarantee.

Do you feel strongly about "going all the way" in terms of consistency? I'll give it some deeper thought on my end as well.

Member:
I'm guessing that this feature came from your own experience? If so, what would you prefer?

As long as we clearly document it, I think it would be OK. It would also be something we could change later on.

It may also be worth bringing up with the community in the Vitess Slack to see what others think.

Collaborator Author:
There was a desire for this behavior at GitHub from Arthur and me, but I wouldn't quite say there was an explicit need for it, more of a nice to have. I'd imagine we would have initially sacrificed the additional consistency for the ease of adoption and reduced disruption that the "automatic connection shifting" would have afforded us. I think this can be a start, with eyes on guaranteeing more consistency if the need arises in the future.

Collaborator Author:
Clarified the docs: vitessio/website@f207553

Member:
At GitHub (and I'm sure at many other places), queries made outside of transactions are automatically retried a few times in case of a failure at the connection level.

This means that the application is actually unaware that a query failed, and due to the auto-reconnect that's happening, the consistency guarantee is already sacrificed for replica connections in these cases for better availability / transparent error handling.

A host going down for maintenance or unexpectedly becoming unavailable was a pretty rare occurrence compared to the frequency of queries being executed.


verifyStickiness(t, conns, 20)
}

// setupCluster sets up a cluster with a vtgate using the session balancer.
func setupCluster(t *testing.T) (*cluster.VtgateProcess, mysql.ConnParams, []*cluster.Vttablet, map[string]int64) {
t.Helper()

// Start vtgate in cell1 with session mode
vtgateProcess := cluster.VtgateProcessInstance(
clusterInstance.GetAndReservePort(),
clusterInstance.GetAndReservePort(),
clusterInstance.GetAndReservePort(),
cell1,
fmt.Sprintf("%s,%s", cell1, cell2),
clusterInstance.Hostname,
replicaStr,
clusterInstance.TopoProcess.Port,
clusterInstance.TmpDirectory,
[]string{
"--vtgate-balancer-mode", "session",
},
plancontext.PlannerVersion(0),
)
require.NoError(t, vtgateProcess.Setup())
require.True(t, vtgateProcess.WaitForStatus())

vtParams := mysql.ConnParams{
Host: clusterInstance.Hostname,
Port: vtgateProcess.MySQLServerPort,
}

allTablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
shardName := clusterInstance.Keyspaces[0].Shards[0].Name
replicaTablets := replicaTablets(allTablets)

conn, err := mysql.Connect(context.Background(), &vtParams)
require.NoError(t, err)
defer conn.Close()

// Wait for tablets to be discovered
err = vtgateProcess.WaitForStatusOfTabletInShard(fmt.Sprintf("%s.%s.primary", keyspaceName, shardName), 1, 30*time.Second)
require.NoError(t, err)

err = vtgateProcess.WaitForStatusOfTabletInShard(fmt.Sprintf("%s.%s.replica", keyspaceName, shardName), len(replicaTablets), 30*time.Second)
require.NoError(t, err)

aliases := mapTabletAliasToMySQLServerID(t, allTablets)

// Insert test data
testValue := fmt.Sprintf("session_test_%d", time.Now().UnixNano())
_, err = conn.ExecuteFetch(fmt.Sprintf("INSERT INTO balancer_test (value) VALUES ('%s')", testValue), 1, false)
require.NoError(t, err)
waitForReplication(t, replicaTablets, testValue)

return vtgateProcess, vtParams, replicaTablets, aliases
}

// getServerID returns the server ID that the connection is currently routing to.
func getServerID(t *testing.T, conn *mysql.Conn) int64 {
t.Helper()

res, err := conn.ExecuteFetch("SELECT @@server_id", 1, false)
require.NoError(t, err)
require.Equal(t, 1, len(res.Rows), "expected one row from server_id query")

serverID, err := res.Rows[0][0].ToInt64()
require.NoError(t, err)

return serverID
}

// createSessionConnections creates `n` connections that route to different tablets.
// Returns a map of mysql.Conn -> serverID.
func createSessionConnections(t *testing.T, vtParams *mysql.ConnParams, numConnections int) map[*mysql.Conn]int64 {
t.Helper()

conns := make(map[*mysql.Conn]int64)
seenServerIDs := make(map[int64]bool)

// Try up to 50 times to get numConnections with different server IDs
for range 50 {
conn, err := mysql.Connect(context.Background(), vtParams)
require.NoError(t, err)

_, err = conn.ExecuteFetch("USE @replica", 1, false)
require.NoError(t, err)

// Get the server ID this connection routes to
serverID := getServerID(t, conn)

// If this is a new tablet, keep the connection
if !seenServerIDs[serverID] {
seenServerIDs[serverID] = true
conns[conn] = serverID

// If we have enough connections, return
if len(conns) == numConnections {
return conns
}

continue
}

// Already seen this tablet, close and try again
conn.Close()
}

t.Fatalf("could not create %d connections with different tablets after 50 attempts, only got %d", numConnections, len(conns))
return nil
}

// verifyStickiness validates that each of the given connections remains routed to the same
// server `n` times in a row.
func verifyStickiness(t *testing.T, conns map[*mysql.Conn]int64, n uint) {
t.Helper()

for conn, expectedServerID := range conns {
for range n {
currentServerID := getServerID(t, conn)
require.Equal(t, expectedServerID, currentServerID, "Connection should stick to tablet %d, got %d", expectedServerID, currentServerID)
}
}
}
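The review thread above raises a follow-up question: a failover only preserves the session's consistent view if the newly selected replica is at least at the GTID position of the old one. That check is not part of this PR; the sketch below is a hedged illustration of what it could look like, where `canFailover` is a hypothetical helper and the `go/mysql/replication` import path is assumed (older Vitess releases expose these helpers from `go/mysql`).

```go
package example

import (
	"fmt"

	// Assumed import path for Vitess replication-position helpers.
	"vitess.io/vitess/go/mysql/replication"
)

// canFailover (hypothetical helper, not part of this PR) reports whether a
// candidate replica has reached at least the GTID position last observed on
// the replica the session was pinned to. Moving forward in time is fine;
// moving backward would break the session's consistent view.
func canFailover(lastSeenPos, candidatePos string) (bool, error) {
	last, err := replication.DecodePosition(lastSeenPos)
	if err != nil {
		return false, fmt.Errorf("decode last seen position: %w", err)
	}
	cand, err := replication.DecodePosition(candidatePos)
	if err != nil {
		return false, fmt.Errorf("decode candidate position: %w", err)
	}
	// AtLeast is true when the candidate's GTID set contains every
	// transaction in the last seen position.
	return cand.AtLeast(last), nil
}
```

For example, a candidate at `MySQL56/<uuid>:1-120` passes this check against a last-seen position of `MySQL56/<uuid>:1-100`, while one at `MySQL56/<uuid>:1-90` does not.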