
Commit cb1c61a

Author: Shlomi Noach (committed)
- `--cut-over` no longer mandatory; default to safe
- Removed `CutOverVoluntaryLock` and associated code
- Removed `CutOverUdfWait`
- `RenameTablesRollback()` first attempts an atomic swap
2 parents 8292f56 + 302dbf0 commit cb1c61a

4 files changed: +45, -86 lines


build.sh

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 #!/bin/bash
 #
 #
-RELEASE_VERSION="0.8.8"
+RELEASE_VERSION="0.8.9"
 
 buildpath=/tmp/gh-ost
 target=gh-ost

go/base/context.go

Lines changed: 3 additions & 3 deletions
@@ -31,9 +31,9 @@ const (
 type CutOver int
 
 const (
-	CutOverTwoStep CutOver = 1
-	CutOverVoluntaryLock
-	CutOverUdfWait
+	CutOverTwoStep CutOver = iota
+	CutOverVoluntaryLock = iota
+	CutOverUdfWait = iota
 )
 
 const (

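Side note on the constant change above: in the old form, the bare identifiers simply repeat the previous expression, so `CutOverTwoStep`, `CutOverVoluntaryLock` and `CutOverUdfWait` all evaluated to 1; with `iota` each constant gets its own value. A minimal standalone sketch of the difference (constant names below are illustrative, not gh-ost's):

package main

import "fmt"

type CutOver int

// Old form: bare identifiers repeat the previous expression list,
// so all three constants share the value 1.
const (
	oldTwoStep       CutOver = 1
	oldVoluntaryLock         // = 1
	oldUdfWait               // = 1
)

// New form: iota increments once per constant, so values are distinct.
const (
	newTwoStep       CutOver = iota // 0
	newVoluntaryLock         = iota // 1
	newUdfWait               = iota // 2
)

func main() {
	fmt.Println(oldTwoStep, oldVoluntaryLock, oldUdfWait) // 1 1 1
	fmt.Println(newTwoStep, newVoluntaryLock, newUdfWait) // 0 1 2
}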
go/logic/applier.go

Lines changed: 25 additions & 0 deletions
@@ -546,6 +546,31 @@ func (this *Applier) SwapTablesAtomic(sessionIdChan chan int64) error {
 	return nil
 }
 
+func (this *Applier) RenameTablesRollback() (renameError error) {
+
+	query := fmt.Sprintf(`rename /* gh-ost */ table %s.%s to %s.%s`,
+		sql.EscapeName(this.migrationContext.DatabaseName),
+		sql.EscapeName(this.migrationContext.OriginalTableName),
+		sql.EscapeName(this.migrationContext.DatabaseName),
+		sql.EscapeName(this.migrationContext.GetGhostTableName()),
+	)
+	log.Infof("Renaming back to ghost table")
+	if _, err := sqlutils.ExecNoPrepare(this.db, query); err != nil {
+		renameError = err
+	}
+	query = fmt.Sprintf(`rename /* gh-ost */ table %s.%s to %s.%s`,
+		sql.EscapeName(this.migrationContext.DatabaseName),
+		sql.EscapeName(this.migrationContext.GetOldTableName()),
+		sql.EscapeName(this.migrationContext.DatabaseName),
+		sql.EscapeName(this.migrationContext.OriginalTableName),
+	)
+	log.Infof("Renaming back to original table")
+	if _, err := sqlutils.ExecNoPrepare(this.db, query); err != nil {
+		renameError = err
+	}
+	return log.Errore(renameError)
+}
+
 // StopSlaveIOThread is applicable with --test-on-replica; it stops the IO thread, duh.
 // We need to keep the SQL thread active so as to complete processing received events,
 // and have them written to the binary log, so that we can then read them via streamer

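Worth calling out: `RenameTablesRollback()` attempts both renames even if the first one fails, and reports the last error encountered. A self-contained sketch of that rollback pattern, with hypothetical helper names that are not part of gh-ost:

package main

import (
	"errors"
	"fmt"
)

// rollback attempts every step even when an earlier one fails, and reports
// the last error seen: the same shape as RenameTablesRollback() above.
func rollback(steps ...func() error) (rollbackError error) {
	for _, step := range steps {
		if err := step(); err != nil {
			rollbackError = err // remember the failure, but keep going
		}
	}
	return rollbackError
}

func main() {
	err := rollback(
		func() error { return errors.New("rename original -> ghost failed") },
		func() error { return nil }, // the second rename is still attempted
	)
	fmt.Println(err) // rename original -> ghost failed
}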
go/logic/migrator.go

Lines changed: 16 additions & 82 deletions
@@ -389,11 +389,18 @@ func (this *Migrator) stopWritesAndCompleteMigration() (err error) {
 	atomic.StoreInt64(&this.migrationContext.IsPostponingCutOver, 0)
 
 	if this.migrationContext.TestOnReplica {
-		return this.stopWritesAndCompleteMigrationOnReplica()
+		// With `--test-on-replica` we stop the replication thread, and then proceed to use
+		// the same cut-over phase as the master would use. That means we take locks
+		// and swap the tables.
+		// The difference is that we will later swap the tables back.
+		log.Debugf("testing on replica. Stopping replication IO thread")
+		if err := this.retryOperation(this.applier.StopSlaveNicely); err != nil {
+			return err
+		}
+		// We're merely testing; we don't want to keep this state. Roll back the renames as far as possible
+		defer this.applier.RenameTablesRollback()
 	}
-	// Running on master
-
-	{
+	if this.migrationContext.CutOverType == base.CutOverSafe {
 		// Lock-based solution: we use low timeout and multiple attempts. But for
 		// each failed attempt, we throttle until replication lag is back to normal
 		err := this.retryOperation(
@@ -404,20 +411,10 @@ func (this *Migrator) stopWritesAndCompleteMigration() (err error) {
 		return err
 	}
 	if this.migrationContext.CutOverType == base.CutOverTwoStep {
-		return this.stopWritesAndCompleteMigrationOnMasterQuickAndBumpy()
-	}
-
-	{
-		// Lock-based solution: we use low timeout and multiple attempts. But for
-		// each failed attempt, we throttle until replication lag is back to normal
-		if err := this.retryOperation(
-			func() error {
-				return this.executeAndThrottleOnError(this.stopWritesAndCompleteMigrationOnMasterViaLock)
-			}); err != nil {
-			return err
-		}
+		err := this.retryOperation(this.cutOverTwoStep)
+		return err
 	}
-	return
+	return nil
 }
 
 // Inject the "AllEventsUpToLockProcessed" state hint, wait for it to appear in the binary logs,
@@ -440,11 +437,11 @@ func (this *Migrator) waitForEventsUpToLock() (err error) {
 	return nil
 }
 
-// stopWritesAndCompleteMigrationOnMasterQuickAndBumpy will lock down the original table, execute
+// cutOverTwoStep will lock down the original table, execute
 // what's left of last DML entries, and **non-atomically** swap original->old, then new->original.
 // There is a point in time where the "original" table does not exist and queries are non-blocked
 // and failing.
-func (this *Migrator) stopWritesAndCompleteMigrationOnMasterQuickAndBumpy() (err error) {
+func (this *Migrator) cutOverTwoStep() (err error) {
 	if err := this.retryOperation(this.applier.LockOriginalTable); err != nil {
 		return err
 	}
@@ -465,69 +462,6 @@ func (this *Migrator) stopWritesAndCompleteMigrationOnMasterQuickAndBumpy() (err error) {
 	return nil
 }
 
-// stopWritesAndCompleteMigrationOnMasterViaLock will lock down the original table, execute
-// what's left of last DML entries, and atomically swap & unlock (original->old && new->original)
-func (this *Migrator) stopWritesAndCompleteMigrationOnMasterViaLock() (err error) {
-	lockGrabbed := make(chan error, 1)
-	okToReleaseLock := make(chan bool, 1)
-	swapResult := make(chan error, 1)
-	go func() {
-		if err := this.applier.GrabVoluntaryLock(lockGrabbed, okToReleaseLock); err != nil {
-			log.Errore(err)
-		}
-	}()
-	if err := <-lockGrabbed; err != nil {
-		return log.Errore(err)
-	}
-	blockingQuerySessionIdChan := make(chan int64, 1)
-	go func() {
-		this.applier.IssueBlockingQueryOnVoluntaryLock(blockingQuerySessionIdChan)
-	}()
-	blockingQuerySessionId := <-blockingQuerySessionIdChan
-	log.Infof("Intentional blocking query connection id is %+v", blockingQuerySessionId)
-
-	if err := this.retryOperation(
-		func() error {
-			return this.applier.ExpectProcess(blockingQuerySessionId, "User lock", this.migrationContext.GetVoluntaryLockName())
-		}); err != nil {
-		return err
-	}
-	log.Infof("Found blocking query to be executing")
-	swapSessionIdChan := make(chan int64, 1)
-	go func() {
-		swapResult <- this.applier.SwapTablesAtomic(swapSessionIdChan)
-	}()
-
-	swapSessionId := <-swapSessionIdChan
-	log.Infof("RENAME connection id is %+v", swapSessionId)
-	if err := this.retryOperation(
-		func() error {
-			return this.applier.ExpectProcess(swapSessionId, "metadata lock", "rename")
-		}); err != nil {
-		return err
-	}
-	log.Infof("Found RENAME to be executing")
-
-	// OK, at this time we know any newly incoming DML on original table is blocked.
-	this.waitForEventsUpToLock()
-
-	okToReleaseLock <- true
-	// BAM: voluntary lock is released, blocking query is released, rename is released.
-	// We now check RENAME result. We have lock_wait_timeout. We put it on purpose, to avoid
-	// locking the tables for too long. If lock time exceeds said timeout, the RENAME fails
-	// and returns a non-nil error, in which case tables have not been swapped, and we are
-	// not really done. We are, however, good to go for more retries.
-	if err := <-swapResult; err != nil {
-		// Bummer. We shall rest a while and try again
-		return err
-	}
-	// ooh nice! We're actually truly and thankfully done
-	lockAndRenameDuration := this.migrationContext.RenameTablesEndTime.Sub(this.migrationContext.LockTablesStartTime)
-	renameDuration := this.migrationContext.RenameTablesEndTime.Sub(this.migrationContext.RenameTablesStartTime)
-	log.Debugf("Lock & rename duration: %s. Of this, rename time was %s. During rename time, queries on %s were blocked", lockAndRenameDuration, renameDuration, sql.EscapeName(this.migrationContext.OriginalTableName))
-	return nil
-}
-
 // cutOverSafe performs a safe cut over, where normally (no failure) the original table
 // is being locked until swapped, hence DML queries being locked and unaware of the cut-over.
 // In the worst case, there will be a minor outage, where the original table would not exist.

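The safe cut-over path above wraps the cut-over in `retryOperation`, throttling after each failed attempt via `executeAndThrottleOnError`. A simplified, self-contained sketch of that retry-and-throttle pattern; the function bodies below are assumptions for illustration only, not gh-ost's actual implementations:

package main

import (
	"errors"
	"fmt"
	"time"
)

// retryOperation retries an operation up to maxRetries times and returns
// the last error if all attempts fail (a simplified stand-in).
func retryOperation(maxRetries int, operation func() error) (err error) {
	for i := 0; i < maxRetries; i++ {
		if err = operation(); err == nil {
			return nil
		}
	}
	return err
}

// executeAndThrottleOnError runs the operation; on failure it throttles
// (e.g. waits for replication lag to recover) before reporting the error.
func executeAndThrottleOnError(operation func() error, throttle func()) error {
	if err := operation(); err != nil {
		throttle()
		return err
	}
	return nil
}

func main() {
	attempts := 0
	cutOver := func() error {
		attempts++
		if attempts < 3 {
			return errors.New("lock wait timeout") // simulate a failed cut-over attempt
		}
		return nil
	}
	throttle := func() { time.Sleep(10 * time.Millisecond) }

	err := retryOperation(5, func() error {
		return executeAndThrottleOnError(cutOver, throttle)
	})
	fmt.Println("cut-over result:", err) // cut-over result: <nil>
}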
0 commit comments
