Skip to content

Commit 1e10f1f

Browse files
author
Shlomi Noach
committed
Solved various race conditions:
- Operation would terminate after events lock noticed but before applying all events: race condition where the event would be captured asynchronously. The event is now handled sequentially with the DML events, hence now safe. - Multiple rowcopy operations would still write to `rowCopyComplete` channel. This is still the case, but now we only wait for the first and then just flush (read and discard) any others, to avoid blocking - Events DML listener is only added after table creation: the problem was that with very busy tables, the events func buffer would fill up, and the "tables-created" event would be blocked. - `waitForEventsUpToLock()` unifies the waiting on all variants of complete-migration - With `--test-on-replica`, now stopping replication "nicely", using `master_pos_wait()` - With `--test-on-replica`, not throttling on replication after replication is stopped (duh) - More debug output
1 parent 134bf38 commit 1e10f1f

File tree

5 files changed

+155
-71
lines changed

5 files changed

+155
-71
lines changed

go/base/context.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -121,17 +121,17 @@ func GetMigrationContext() *MigrationContext {
121121

122122
// GetGhostTableName generates the name of ghost table, based on original table name
123123
func (this *MigrationContext) GetGhostTableName() string {
124-
return fmt.Sprintf("_%s_New", this.OriginalTableName)
124+
return fmt.Sprintf("_%s_gst", this.OriginalTableName)
125125
}
126126

127127
// GetOldTableName generates the name of the "old" table, into which the original table is renamed.
128128
func (this *MigrationContext) GetOldTableName() string {
129-
return fmt.Sprintf("_%s_Old", this.OriginalTableName)
129+
return fmt.Sprintf("_%s_old", this.OriginalTableName)
130130
}
131131

132132
// GetChangelogTableName generates the name of changelog table, based on original table name
133133
func (this *MigrationContext) GetChangelogTableName() string {
134-
return fmt.Sprintf("_%s_OSC", this.OriginalTableName)
134+
return fmt.Sprintf("_%s_osc", this.OriginalTableName)
135135
}
136136

137137
// GetVoluntaryLockName returns a name of a voluntary lock to be used throughout

go/logic/applier.go

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -548,6 +548,39 @@ func (this *Applier) StopSlaveIOThread() error {
548548
return nil
549549
}
550550

551+
// MasterPosWait is applicable with --test-on-replica
552+
func (this *Applier) MasterPosWait(binlogCoordinates *mysql.BinlogCoordinates) error {
553+
var appliedRows int64
554+
if err := this.db.QueryRow(`select master_pos_wait(?, ?, ?)`, binlogCoordinates.LogFile, binlogCoordinates.LogPos, 1).Scan(&appliedRows); err != nil {
555+
return err
556+
}
557+
if appliedRows < 0 {
558+
return fmt.Errorf("Timeout waiting on master_pos_wait()")
559+
}
560+
return nil
561+
}
562+
563+
func (this *Applier) StopSlaveNicely() error {
564+
if err := this.StopSlaveIOThread(); err != nil {
565+
return err
566+
}
567+
binlogCoordinates, err := mysql.GetReadBinlogCoordinates(this.db)
568+
if err != nil {
569+
return err
570+
}
571+
log.Debugf("Replication stopped at %+v. Will wait for SQL thread to apply", *binlogCoordinates)
572+
if err := this.MasterPosWait(binlogCoordinates); err != nil {
573+
return err
574+
}
575+
log.Debugf("Replication SQL thread applied all events")
576+
if selfBinlogCoordinates, err := mysql.GetSelfBinlogCoordinates(this.db); err != nil {
577+
return err
578+
} else {
579+
log.Debugf("Self binlog coordinates: %+v", *selfBinlogCoordinates)
580+
}
581+
return nil
582+
}
583+
551584
// GrabVoluntaryLock gets a named lock (`GET_LOCK`) and listens
552585
// on a okToRelease in order to release it
553586
func (this *Applier) GrabVoluntaryLock(lockGrabbed chan<- error, okToRelease <-chan bool) error {

go/logic/migrator.go

Lines changed: 87 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ type Migrator struct {
4848
voluntaryLockAcquired chan bool
4949
panicAbort chan error
5050

51+
allEventsUpToLockProcessedFlag int64
5152
// copyRowsQueue should not be buffered; if buffered some non-damaging but
5253
// excessive work happens at the end of the iteration as new copy-jobs arrive befroe realizing the copy is complete
5354
copyRowsQueue chan tableWriteFunc
@@ -65,6 +66,8 @@ func NewMigrator() *Migrator {
6566
voluntaryLockAcquired: make(chan bool, 1),
6667
panicAbort: make(chan error),
6768

69+
allEventsUpToLockProcessedFlag: 0,
70+
6871
copyRowsQueue: make(chan tableWriteFunc),
6972
applyEventsQueue: make(chan tableWriteFunc, applyEventsQueueBuffer),
7073
handledChangelogStates: make(map[string]bool),
@@ -106,7 +109,7 @@ func (this *Migrator) shouldThrottle() (result bool, reason string) {
106109
if time.Duration(lag) > time.Duration(this.migrationContext.MaxLagMillisecondsThrottleThreshold)*time.Millisecond {
107110
return true, fmt.Sprintf("lag=%fs", time.Duration(lag).Seconds())
108111
}
109-
if this.migrationContext.TestOnReplica {
112+
if this.migrationContext.TestOnReplica && (atomic.LoadInt64(&this.allEventsUpToLockProcessedFlag) == 0) {
110113
replicationLag, err := mysql.GetMaxReplicationLag(this.migrationContext.InspectorConnectionConfig, this.migrationContext.ThrottleControlReplicaKeys, this.migrationContext.ReplictionLagQuery)
111114
if err != nil {
112115
return true, err.Error()
@@ -198,6 +201,16 @@ func (this *Migrator) executeAndThrottleOnError(operation func() error) (err err
198201
return nil
199202
}
200203

204+
// consumeRowCopyComplete blocks on the rowCopyComplete channel once, and then
205+
// consumers and drops any further incoming events that may be left hanging.
206+
func (this *Migrator) consumeRowCopyComplete() {
207+
<-this.rowCopyComplete
208+
go func() {
209+
for <-this.rowCopyComplete {
210+
}
211+
}()
212+
}
213+
201214
func (this *Migrator) canStopStreaming() bool {
202215
return false
203216
}
@@ -215,33 +228,18 @@ func (this *Migrator) onChangelogStateEvent(dmlEvent *binlog.BinlogDMLEvent) (er
215228
}
216229
case AllEventsUpToLockProcessed:
217230
{
218-
this.allEventsUpToLockProcessed <- true
219-
}
220-
default:
221-
{
222-
return fmt.Errorf("Unknown changelog state: %+v", changelogState)
223-
}
224-
}
225-
log.Debugf("Received state %+v", changelogState)
226-
return nil
227-
}
228-
229-
func (this *Migrator) onChangelogState(stateValue string) (err error) {
230-
log.Fatalf("I shouldn't be here")
231-
if this.handledChangelogStates[stateValue] {
232-
return nil
233-
}
234-
this.handledChangelogStates[stateValue] = true
235-
236-
changelogState := ChangelogState(stateValue)
237-
switch changelogState {
238-
case TablesInPlace:
239-
{
240-
this.tablesInPlace <- true
241-
}
242-
case AllEventsUpToLockProcessed:
243-
{
244-
this.allEventsUpToLockProcessed <- true
231+
applyEventFunc := func() error {
232+
this.allEventsUpToLockProcessed <- true
233+
return nil
234+
}
235+
// at this point we know all events up to lock have been read from the streamer,
236+
// because the streamer works sequentially. So those events are either already handled,
237+
// or have event functions in applyEventsQueue.
238+
// So as not to create a potential deadlock, we write this func to applyEventsQueue
239+
// asynchronously, understanding it doesn't really matter.
240+
go func() {
241+
this.applyEventsQueue <- applyEventFunc
242+
}()
245243
}
246244
default:
247245
{
@@ -295,6 +293,9 @@ func (this *Migrator) Migrate() (err error) {
295293
if err := this.inspector.InspectOriginalAndGhostTables(); err != nil {
296294
return err
297295
}
296+
if err := this.addDMLEventsListener(); err != nil {
297+
return err
298+
}
298299
go this.initiateHeartbeatListener()
299300

300301
if err := this.applier.ReadMigrationRangeValues(); err != nil {
@@ -307,7 +308,7 @@ func (this *Migrator) Migrate() (err error) {
307308
go this.initiateStatus()
308309

309310
log.Debugf("Operating until row copy is complete")
310-
<-this.rowCopyComplete
311+
this.consumeRowCopyComplete()
311312
log.Debugf("Row copy complete")
312313
this.printStatus()
313314

@@ -336,18 +337,20 @@ func (this *Migrator) stopWritesAndCompleteMigration() (err error) {
336337
if this.migrationContext.QuickAndBumpySwapTables {
337338
return this.stopWritesAndCompleteMigrationOnMasterQuickAndBumpy()
338339
}
339-
// Lock-based solution: we use low timeout and multiple attempts. But for
340-
// each failed attempt, we throttle until replication lag is back to normal
341-
if err := this.retryOperation(
342-
func() error {
343-
return this.executeAndThrottleOnError(this.stopWritesAndCompleteMigrationOnMasterViaLock)
344-
}); err != nil {
345-
return err
346-
}
347-
if err := this.dropOldTableIfRequired(); err != nil {
348-
return err
349-
}
350340

341+
{
342+
// Lock-based solution: we use low timeout and multiple attempts. But for
343+
// each failed attempt, we throttle until replication lag is back to normal
344+
if err := this.retryOperation(
345+
func() error {
346+
return this.executeAndThrottleOnError(this.stopWritesAndCompleteMigrationOnMasterViaLock)
347+
}); err != nil {
348+
return err
349+
}
350+
if err := this.dropOldTableIfRequired(); err != nil {
351+
return err
352+
}
353+
}
351354
return
352355
}
353356

@@ -364,6 +367,21 @@ func (this *Migrator) dropOldTableIfRequired() (err error) {
364367
return nil
365368
}
366369

370+
// Inject the "AllEventsUpToLockProcessed" state hint, wait for it to appear in the binary logs,
371+
// make sure the queue is drained.
372+
func (this *Migrator) waitForEventsUpToLock() (err error) {
373+
if _, err := this.applier.WriteChangelogState(string(AllEventsUpToLockProcessed)); err != nil {
374+
return err
375+
}
376+
log.Debugf("Waiting for events up to lock")
377+
<-this.allEventsUpToLockProcessed
378+
atomic.StoreInt64(&this.allEventsUpToLockProcessedFlag, 1)
379+
log.Debugf("Done waiting for events up to lock")
380+
this.printStatus()
381+
382+
return nil
383+
}
384+
367385
// stopWritesAndCompleteMigrationOnMasterQuickAndBumpy will lock down the original table, execute
368386
// what's left of last DML entries, and **non-atomically** swap original->old, then new->original.
369387
// There is a point in time where the "original" table does not exist and queries are non-blocked
@@ -373,11 +391,9 @@ func (this *Migrator) stopWritesAndCompleteMigrationOnMasterQuickAndBumpy() (err
373391
return err
374392
}
375393

376-
this.applier.WriteChangelogState(string(AllEventsUpToLockProcessed))
377-
log.Debugf("Waiting for events up to lock")
378-
<-this.allEventsUpToLockProcessed
379-
log.Debugf("Done waiting for events up to lock")
380-
394+
if err := this.retryOperation(this.waitForEventsUpToLock); err != nil {
395+
return err
396+
}
381397
if err := this.retryOperation(this.applier.SwapTablesQuickAndBumpy); err != nil {
382398
return err
383399
}
@@ -438,10 +454,7 @@ func (this *Migrator) stopWritesAndCompleteMigrationOnMasterViaLock() (err error
438454
log.Infof("Found RENAME to be executing")
439455

440456
// OK, at this time we know any newly incoming DML on original table is blocked.
441-
this.applier.WriteChangelogState(string(AllEventsUpToLockProcessed))
442-
log.Debugf("Waiting for events up to lock")
443-
<-this.allEventsUpToLockProcessed
444-
log.Debugf("Done waiting for events up to lock")
457+
this.waitForEventsUpToLock()
445458

446459
okToReleaseLock <- true
447460
// BAM: voluntary lock is released, blocking query is released, rename is released.
@@ -466,14 +479,11 @@ func (this *Migrator) stopWritesAndCompleteMigrationOnMasterViaLock() (err error
466479
// in sync. There is no table swap.
467480
func (this *Migrator) stopWritesAndCompleteMigrationOnReplica() (err error) {
468481
log.Debugf("testing on replica. Instead of LOCK tables I will STOP SLAVE")
469-
if err := this.retryOperation(this.applier.StopSlaveIOThread); err != nil {
482+
if err := this.retryOperation(this.applier.StopSlaveNicely); err != nil {
470483
return err
471484
}
472485

473-
this.applier.WriteChangelogState(string(AllEventsUpToLockProcessed))
474-
log.Debugf("Waiting for events up to lock")
475-
<-this.allEventsUpToLockProcessed
476-
log.Debugf("Done waiting for events up to lock")
486+
this.waitForEventsUpToLock()
477487

478488
log.Info("Table duplicated with new schema. Am not touching the original table. Replication is stopped. You may now compare the two tables to gain trust into this tool's operation")
479489
return nil
@@ -612,7 +622,17 @@ func (this *Migrator) initiateStreaming() error {
612622
return this.onChangelogStateEvent(dmlEvent)
613623
},
614624
)
615-
this.eventsStreamer.AddListener(
625+
626+
go func() {
627+
log.Debugf("Beginning streaming")
628+
this.eventsStreamer.StreamEvents(func() bool { return this.canStopStreaming() })
629+
}()
630+
return nil
631+
}
632+
633+
// addDMLEventsListener
634+
func (this *Migrator) addDMLEventsListener() error {
635+
err := this.eventsStreamer.AddListener(
616636
false,
617637
this.migrationContext.DatabaseName,
618638
this.migrationContext.OriginalTableName,
@@ -624,12 +644,7 @@ func (this *Migrator) initiateStreaming() error {
624644
return nil
625645
},
626646
)
627-
628-
go func() {
629-
log.Debugf("Beginning streaming")
630-
this.eventsStreamer.StreamEvents(func() bool { return this.canStopStreaming() })
631-
}()
632-
return nil
647+
return err
633648
}
634649

635650
func (this *Migrator) initiateApplier() error {
@@ -680,13 +695,16 @@ func (this *Migrator) iterateChunks() error {
680695
if !hasFurtherRange {
681696
return terminateRowIteration(nil)
682697
}
683-
_, rowsAffected, _, err := this.applier.ApplyIterationInsertQuery()
684-
if err != nil {
685-
return terminateRowIteration(err)
698+
applyCopyRowsFunc := func() error {
699+
_, rowsAffected, _, err := this.applier.ApplyIterationInsertQuery()
700+
if err != nil {
701+
return terminateRowIteration(err)
702+
}
703+
atomic.AddInt64(&this.migrationContext.TotalRowsCopied, rowsAffected)
704+
atomic.AddInt64(&this.migrationContext.Iteration, 1)
705+
return nil
686706
}
687-
atomic.AddInt64(&this.migrationContext.TotalRowsCopied, rowsAffected)
688-
atomic.AddInt64(&this.migrationContext.Iteration, 1)
689-
return nil
707+
return this.retryOperation(applyCopyRowsFunc)
690708
}
691709
this.copyRowsQueue <- copyRowsFunc
692710
}
@@ -714,7 +732,8 @@ func (this *Migrator) executeWriteFuncs() error {
714732
select {
715733
case copyRowsFunc := <-this.copyRowsQueue:
716734
{
717-
if err := this.retryOperation(copyRowsFunc); err != nil {
735+
// Retries are handled within the copyRowsFunc
736+
if err := copyRowsFunc(); err != nil {
718737
return log.Errore(err)
719738
}
720739
}

go/logic/streamer.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
gosql "database/sql"
1010
"fmt"
1111
"strings"
12+
"sync"
1213

1314
"github.com/github/gh-osc/go/base"
1415
"github.com/github/gh-osc/go/binlog"
@@ -37,6 +38,7 @@ type EventsStreamer struct {
3738
migrationContext *base.MigrationContext
3839
nextBinlogCoordinates *mysql.BinlogCoordinates
3940
listeners [](*BinlogEventListener)
41+
listenersMutex *sync.Mutex
4042
eventsChannel chan *binlog.BinlogEntry
4143
binlogReader binlog.BinlogReader
4244
}
@@ -46,12 +48,17 @@ func NewEventsStreamer() *EventsStreamer {
4648
connectionConfig: base.GetMigrationContext().InspectorConnectionConfig,
4749
migrationContext: base.GetMigrationContext(),
4850
listeners: [](*BinlogEventListener){},
51+
listenersMutex: &sync.Mutex{},
4952
eventsChannel: make(chan *binlog.BinlogEntry, EventsChannelBufferSize),
5053
}
5154
}
5255

5356
func (this *EventsStreamer) AddListener(
5457
async bool, databaseName string, tableName string, onDmlEvent func(event *binlog.BinlogDMLEvent) error) (err error) {
58+
59+
this.listenersMutex.Lock()
60+
defer this.listenersMutex.Unlock()
61+
5562
if databaseName == "" {
5663
return fmt.Errorf("Empty database name in AddListener")
5764
}
@@ -69,6 +76,9 @@ func (this *EventsStreamer) AddListener(
6976
}
7077

7178
func (this *EventsStreamer) notifyListeners(binlogEvent *binlog.BinlogDMLEvent) {
79+
this.listenersMutex.Lock()
80+
defer this.listenersMutex.Unlock()
81+
7282
for _, listener := range this.listeners {
7383
if strings.ToLower(listener.databaseName) != strings.ToLower(binlogEvent.DatabaseName) {
7484
continue

0 commit comments

Comments
 (0)