@@ -7,49 +7,66 @@ import (
77 "context"
88 "crypto/tls"
99 "encoding/binary"
10- "time"
10+ "sync"
11+ "sync/atomic"
1112
12- "github.com/pingcap/tiproxy/lib/util/errors "
13+ glist "github.com/bahlo/generic-list-go "
1314 "github.com/pingcap/tiproxy/pkg/manager/id"
1415 "github.com/pingcap/tiproxy/pkg/proxy/backend"
1516 pnet "github.com/pingcap/tiproxy/pkg/proxy/net"
1617 "github.com/pingcap/tiproxy/pkg/sqlreplay/cmd"
1718 "go.uber.org/zap"
1819)
1920
20- const (
21- maxPendingCommands = 100 // pending commands for each connection
22- )
// ReplayStats records the statistics collected during a replay.
// All connections share one ReplayStats and update it concurrently, which is
// why every field is an atomic counter.
type ReplayStats struct {
	// ReplayedCmds is the number of executed commands.
	ReplayedCmds atomic.Uint64
	// PendingCmds is the number of decoded but not executed commands.
	PendingCmds atomic.Int64
}

// Reset sets every counter back to zero.
// The counters are stored one after another, not as a single atomic step, so a
// concurrent reader may briefly observe one counter reset and the other not.
func (s *ReplayStats) Reset() {
	s.ReplayedCmds.Store(0)
	s.PendingCmds.Store(0)
}
2333
2434type Conn interface {
2535 Run (ctx context.Context )
2636 ExecuteCmd (command * cmd.Command )
37+ Stop ()
2738}
2839
2940type ConnCreator func (connID uint64 ) Conn
3041
3142var _ Conn = (* conn )(nil )
3243
3344type conn struct {
34- cmdCh chan * cmd.Command
35- exceptionCh chan <- Exception
36- closeCh chan <- uint64
37- lg * zap.Logger
38- backendConn BackendConn
39- connID uint64 // capture ID, not replay ID
45+ cmdLock sync.Mutex
46+ cmdCh chan struct {}
47+ cmdList * glist.List [* cmd.Command ]
48+ exceptionCh chan <- Exception
49+ closeCh chan <- uint64
50+ lg * zap.Logger
51+ backendConn BackendConn
52+ connID uint64 // capture ID, not replay ID
53+ replayStats * ReplayStats
54+ lastPendingCmds int // last pending cmds reported to the stats
4055}
4156
4257func NewConn (lg * zap.Logger , username , password string , backendTLSConfig * tls.Config , hsHandler backend.HandshakeHandler ,
43- idMgr * id.IDManager , connID uint64 , bcConfig * backend.BCConfig , exceptionCh chan <- Exception , closeCh chan <- uint64 ) * conn {
58+ idMgr * id.IDManager , connID uint64 , bcConfig * backend.BCConfig , exceptionCh chan <- Exception , closeCh chan <- uint64 , replayStats * ReplayStats ) * conn {
4459 backendConnID := idMgr .NewID ()
4560 lg = lg .With (zap .Uint64 ("captureID" , connID ), zap .Uint64 ("replayID" , backendConnID ))
4661 return & conn {
4762 lg : lg ,
4863 connID : connID ,
49- cmdCh : make (chan * cmd.Command , maxPendingCommands ),
64+ cmdList : glist .New [* cmd.Command ](),
65+ cmdCh : make (chan struct {}, 1 ),
5066 exceptionCh : exceptionCh ,
5167 closeCh : closeCh ,
5268 backendConn : NewBackendConn (lg .Named ("be" ), backendConnID , hsHandler , bcConfig , backendTLSConfig , username , password ),
69+ replayStats : replayStats ,
5370 }
5471}
5572
@@ -59,22 +76,42 @@ func (c *conn) Run(ctx context.Context) {
5976 c .exceptionCh <- NewOtherException (err , c .connID )
6077 return
6178 }
62- for {
79+ // context is canceled when the replay is interrupted.
80+ // cmdCh is closed when the replay is finished.
81+ finished := false
82+ for ! finished {
6383 select {
6484 case <- ctx .Done ():
65- // ctx is canceled when the replay is finished
6685 return
67- case command := <- c .cmdCh :
68- if err := c .backendConn .ExecuteCmd (ctx , command .Payload ); err != nil {
86+ case _ , ok := <- c .cmdCh :
87+ if ! ok {
88+ finished = true
89+ }
90+ }
91+ for ctx .Err () == nil {
92+ c .cmdLock .Lock ()
93+ pendingCmds := c .cmdList .Len ()
94+ command := c .cmdList .Back ()
95+ if command != nil {
96+ c .cmdList .Remove (command )
97+ }
98+ c .updatePendingCmds (pendingCmds )
99+ c .cmdLock .Unlock ()
100+ if command == nil {
101+ break
102+ }
103+ if err := c .backendConn .ExecuteCmd (ctx , command .Value .Payload ); err != nil {
69104 if pnet .IsDisconnectError (err ) {
70105 c .exceptionCh <- NewOtherException (err , c .connID )
106+ c .lg .Debug ("backend connection disconnected" , zap .Error (err ))
71107 return
72108 }
73- if c .updateCmdForExecuteStmt (command ) {
74- c .exceptionCh <- NewFailException (err , command )
109+ if c .updateCmdForExecuteStmt (command . Value ) {
110+ c .exceptionCh <- NewFailException (err , command . Value )
75111 }
76112 }
77- if command .Type == pnet .ComQuit {
113+ c .replayStats .ReplayedCmds .Add (1 )
114+ if command .Value .Type == pnet .ComQuit {
78115 return
79116 }
80117 }
@@ -99,23 +136,41 @@ func (c *conn) updateCmdForExecuteStmt(command *cmd.Command) bool {
99136 return true
100137}
101138
102- // ExecuteCmd executes a command asynchronously.
139+ // ExecuteCmd executes a command asynchronously by adding it to the list.
140+ // Adding commands should never block because it may cause cycle wait, so we don't use channels.
141+ // Conn A: wait for the lock held by conn B, and then its list becomes full and blocks the replay
142+ // Conn B: wait for next command, but the replay is blocked, so the lock won't be released
103143func (c * conn ) ExecuteCmd (command * cmd.Command ) {
144+ c .cmdLock .Lock ()
145+ c .cmdList .PushFront (command )
146+ pendingCmds := c .cmdList .Len ()
147+ c .updatePendingCmds (pendingCmds )
148+ c .cmdLock .Unlock ()
104149 select {
105- case c .cmdCh <- command :
106- case <- time .After (3 * time .Second ):
107- // If the replay is slower, wait until it catches up, otherwise too many transactions are broken.
108- // But if it's blocked due to a bug, discard the command to avoid block the whole replay.
109- // If the discarded command is a COMMIT, let the next COMMIT finish the transaction.
110- select {
111- case c .exceptionCh <- NewOtherException (errors .New ("too many pending commands, discard command" ), c .connID ):
112- default :
113- c .lg .Warn ("too many pending errors, discard error" )
114- }
150+ case c .cmdCh <- struct {}{}:
151+ default :
152+ }
153+ }
154+
155+ func (c * conn ) Stop () {
156+ close (c .cmdCh )
157+ }
158+
159+ func (c * conn ) updatePendingCmds (pendingCmds int ) {
160+ diff := pendingCmds - c .lastPendingCmds
161+ c .lastPendingCmds = pendingCmds
162+ if diff != 0 {
163+ c .replayStats .PendingCmds .Add (int64 (diff ))
115164 }
116165}
117166
118167func (c * conn ) close () {
168+ c .cmdLock .Lock ()
169+ if c .cmdList .Len () > 0 {
170+ c .lg .Debug ("backend connection closed while there are still pending commands" , zap .Int ("pending_cmds" , c .cmdList .Len ()))
171+ }
172+ c .updatePendingCmds (0 )
173+ c .cmdLock .Unlock ()
119174 c .backendConn .Close ()
120175 c .closeCh <- c .connID
121176}
0 commit comments