
Commit 8a81b91

optimize migration; clean migrated data
1 parent 325ec10 commit 8a81b91

3 files changed: +118, -74 lines

cluster/core/core.go

Lines changed: 0 additions & 53 deletions
@@ -1,11 +1,9 @@
 package core

 import (
-	"sync"

 	"github.com/hdt3213/godis/cluster/raft"
 	dbimpl "github.com/hdt3213/godis/database"
-	"github.com/hdt3213/godis/datastruct/set"
 	"github.com/hdt3213/godis/interface/database"
 	"github.com/hdt3213/godis/interface/redis"
 	"github.com/hdt3213/godis/lib/logger"
@@ -49,57 +47,6 @@ func (c *Cluster) SelfID() string {
 	return c.raftNode.Cfg.ID()
 }

-// slotsManager manages the slots hosted on the current node
-type slotsManager struct {
-	mu            *sync.RWMutex
-	slots         map[uint32]*slotStatus // slots hosted on the current node
-	importingTask *raft.MigratingTask
-}
-
-const (
-	slotStateHosting = iota
-	slotStateImporting
-	slotStateExporting
-)
-
-type slotStatus struct {
-	mu    *sync.RWMutex
-	state int
-	keys  *set.Set // keys currently stored in this slot
-
-	exportSnapshot *set.Set // snapshot of the slot's keys taken when the transfer starts, to avoid concurrent modification
-	dirtyKeys      *set.Set // keys modified after the transfer started; they are re-sent in the finishing phase
-}
-
-func newSlotsManager() *slotsManager {
-	return &slotsManager{
-		mu:    &sync.RWMutex{},
-		slots: map[uint32]*slotStatus{},
-	}
-}
-
-func (ssm *slotsManager) getSlot(index uint32) *slotStatus {
-	ssm.mu.RLock()
-	slot := ssm.slots[index]
-	ssm.mu.RUnlock()
-	if slot != nil {
-		return slot
-	}
-	ssm.mu.Lock()
-	defer ssm.mu.Unlock()
-	// check-lock-check
-	slot = ssm.slots[index]
-	if slot != nil {
-		return slot
-	}
-	slot = &slotStatus{
-		state: slotStateHosting,
-		keys:  set.Make(),
-		mu:    &sync.RWMutex{},
-	}
-	ssm.slots[index] = slot
-	return slot
-}

 func NewCluster(cfg *Config) (*Cluster, error) {
 	var connections ConnectionFactory

cluster/core/migration.go

Lines changed: 94 additions & 15 deletions
@@ -2,7 +2,7 @@ package core

 import (
 	"fmt"
-	"strconv"
+	"sync"
 	"time"

 	"github.com/hdt3213/godis/aof"
@@ -26,6 +26,58 @@ func init() {
 	RegisterCmd(startMigrationCommand, execStartMigration)
 }

+// slotsManager manages the slots hosted on the current node
+type slotsManager struct {
+	mu            *sync.RWMutex
+	slots         map[uint32]*slotStatus // slots hosted on the current node
+	importingTask *raft.MigratingTask
+}
+
+const (
+	slotStateHosting = iota
+	slotStateImporting
+	slotStateExporting
+)
+
+type slotStatus struct {
+	mu    *sync.RWMutex
+	state int
+	keys  *set.Set // keys currently stored in this slot
+
+	exportSnapshot *set.Set // snapshot of the slot's keys taken when the transfer starts, to avoid concurrent modification
+	dirtyKeys      *set.Set // keys modified after the transfer started; they are re-sent in the finishing phase
+}
+
+func newSlotsManager() *slotsManager {
+	return &slotsManager{
+		mu:    &sync.RWMutex{},
+		slots: map[uint32]*slotStatus{},
+	}
+}
+
+func (ssm *slotsManager) getSlot(index uint32) *slotStatus {
+	ssm.mu.RLock()
+	slot := ssm.slots[index]
+	ssm.mu.RUnlock()
+	if slot != nil {
+		return slot
+	}
+	ssm.mu.Lock()
+	defer ssm.mu.Unlock()
+	// check-lock-check
+	slot = ssm.slots[index]
+	if slot != nil {
+		return slot
+	}
+	slot = &slotStatus{
+		state: slotStateHosting,
+		keys:  set.Make(),
+		mu:    &sync.RWMutex{},
+	}
+	ssm.slots[index] = slot
+	return slot
+}
+
 func (sm *slotStatus) startExporting() protocol.ErrorReply {
 	sm.mu.Lock()
 	defer sm.mu.Unlock()
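
The getSlot method above is a double-checked ("check-lock-check") lookup: an optimistic read under the read lock, then a second check under the write lock so that two goroutines racing on a missing slot do not both allocate one. For comparison, a sketch of the same idea built on sync.Map, whose LoadOrStore makes the check-and-insert atomic (illustration only, not part of this commit; altSlots and getSlotAlt are hypothetical names):

    // getSlotAlt is a hypothetical alternative to getSlot built on sync.Map.
    // LoadOrStore returns the existing value when the key is already present;
    // the losing goroutine's freshly built candidate is simply discarded.
    var altSlots sync.Map // map[uint32]*slotStatus

    func getSlotAlt(index uint32) *slotStatus {
    	if v, ok := altSlots.Load(index); ok {
    		return v.(*slotStatus)
    	}
    	candidate := &slotStatus{
    		state: slotStateHosting,
    		keys:  set.Make(),
    		mu:    &sync.RWMutex{},
    	}
    	actual, _ := altSlots.LoadOrStore(index, candidate)
    	return actual.(*slotStatus)
    }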
@@ -44,6 +96,25 @@ func (sm *slotStatus) finishExportingWithinLock() {
 	sm.exportSnapshot = nil
 }

+func (cluster *Cluster) dropSlot(index uint32) {
+	cluster.slotsManager.mu.RLock()
+	slot := cluster.slotsManager.slots[index]
+	cluster.slotsManager.mu.RUnlock()
+	if slot == nil {
+		return
+	}
+	slot.mu.Lock()
+	defer slot.mu.Unlock()
+	c := connection.NewFakeConn()
+	slot.keys.ForEach(func(key string) bool {
+		cluster.LocalExec(c, utils.ToCmdLine("del", key))
+		return true
+	})
+	cluster.slotsManager.mu.Lock()
+	delete(cluster.slotsManager.slots, index)
+	cluster.slotsManager.mu.Unlock()
+}
+
 func (cluster *Cluster) injectInsertCallback() {
 	cb := func(dbIndex int, key string, entity *database.DataEntity) {
 		slotIndex := cluster.GetSlot(key)
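
Note that dropSlot removes migrated data by issuing ordinary del commands through LocalExec on a fake connection rather than reaching into the storage engine, so the deletions flow through the normal execution path. A minimal sketch of that pattern, assuming a *Cluster as in this package (deleteKeysViaCommandPath is a hypothetical helper, not part of this commit):

    // deleteKeysViaCommandPath deletes keys through the regular command pipeline.
    func deleteKeysViaCommandPath(cluster *Cluster, keys []string) {
    	c := connection.NewFakeConn() // in-process connection, no network peer
    	for _, key := range keys {
    		cluster.LocalExec(c, utils.ToCmdLine("del", key))
    	}
    }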
@@ -187,32 +258,40 @@ func execFinishExport(cluster *Cluster, c redis.Connection, cmdLine CmdLine) redis.Reply {
 	}
 	logger.Infof("finishing migration task %s, route changed", taskId)

+	// clean migrated slots
+	go func() {
+		defer func() {
+			if e := recover(); e != nil {
+				logger.Errorf("panic %v", e)
+			}
+		}()
+		for _, index := range task.Slots {
+			cluster.dropSlot(index)
+		}
+	}()
 	c.Write(protocol.MakeOkReply().ToBytes())
 	return &protocol.NoReply{}
 }

 // execStartMigration receives startMigrationCommand from the leader and starts the migration job in the background
-// command line: startMigrationCommand taskId srcNode slotId1 [slotId2]...
+// command line: startMigrationCommand taskId
 func execStartMigration(cluster *Cluster, c redis.Connection, cmdLine CmdLine) redis.Reply {
-	if len(cmdLine) < 4 {
+	if len(cmdLine) != 2 {
 		return protocol.MakeArgNumErrReply(startMigrationCommand)
 	}
 	taskId := string(cmdLine[1])
-	srcNode := string(cmdLine[2])
-	var slotIds []uint32
-	for _, slotIdStr := range cmdLine[3:] {
-		slotId, err := strconv.Atoi(string(slotIdStr))
-		if err != nil {
-			return protocol.MakeErrReply("illegal slot id: " + string(slotIdStr))
+
+	var task *raft.MigratingTask
+	for i := 0; i < 50; i++ {
+		task = cluster.raftNode.FSM.GetMigratingTask(taskId)
+		if task == nil {
+			time.Sleep(time.Millisecond * 100)
 		}
-		slotIds = append(slotIds, uint32(slotId))
 	}
-	task := &raft.MigratingTask{
-		ID:         taskId,
-		SrcNode:    srcNode,
-		TargetNode: cluster.SelfID(),
-		Slots:      slotIds,
+	if task == nil {
+		return protocol.MakeErrReply("ERR get migrating task timeout")
 	}
+
 	cluster.slotsManager.mu.Lock()
 	cluster.slotsManager.importingTask = task
 	cluster.slotsManager.mu.Unlock()
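
With this change the target node no longer trusts slot ids passed on the command line: it polls its local raft FSM (50 attempts, 100ms apart, roughly 5 seconds) until the EventStartMigrate entry proposed by the leader has been applied. The same wait written against an explicit deadline might look as follows (waitForMigratingTask is a hypothetical helper, not part of this commit; unlike the committed loop it returns as soon as the task appears):

    // waitForMigratingTask polls the raft FSM until the task is visible locally
    // or the deadline expires.
    func waitForMigratingTask(cluster *Cluster, taskId string) *raft.MigratingTask {
    	deadline := time.Now().Add(5 * time.Second)
    	for time.Now().Before(deadline) {
    		if task := cluster.raftNode.FSM.GetMigratingTask(taskId); task != nil {
    			return task
    		}
    		time.Sleep(100 * time.Millisecond)
    	}
    	return nil
    }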

cluster/core/node_manager.go

Lines changed: 24 additions & 6 deletions
@@ -4,7 +4,6 @@ import (
 	"errors"
 	"fmt"
 	"math"
-	"strconv"
 	"sync"
 	"time"

@@ -15,6 +14,29 @@ import (
 	"github.com/hdt3213/godis/redis/protocol"
 )

+/*
+
+**Rebalance Procedure**
+1. Invoke `triggerMigrationTask` on the cluster Leader to start a migration task
+2. Leader proposes EventStartMigrate to the raft group, then sends startMigrationCommand to the Target Node (at `triggerMigrationTask`)
+
+3. Target Node runs `doImports` after receiving startMigrationCommand
+4. Target Node sends exportCommand to the Source Node
+
+5. Source Node gets the migrating task from raft (at `execExport`)
+6. Source Node sets the task into slotsManager to start recording dirty keys during the migration (at `injectInsertCallback`)
+7. Source Node dumps the old data to the Target Node
+
+8. Target Node sends migrationDoneCommand to the Source Node (at `doImports`)
+9. Source Node runs `execFinishExport`, locking the slots to stop writes
+10. Source Node sends the dirty keys to the Target Node
+
+11. Source Node sends migrationChangeRouteCommand to the Leader
+12. Leader proposes EventFinishMigrate to raft and waits until both Source Node and Target Node have received this entry (at `execMigrationChangeRoute`)
+13. Source Node finishes exporting: unlocks the slots and cleans the migrated data
+14. Target Node finishes importing: unlocks the slots and starts serving
+*/
+
 const joinClusterCommand = "cluster.join"
 const migrationChangeRouteCommand = "cluster.migration.changeroute"
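
Taken together with the simplified startMigrationCommand below, kicking off a rebalance now only requires proposing a task on the leader; the participants pull everything else from raft. A hypothetical usage sketch (the task id, the node addresses, and rebalanceExample itself are illustrative, not from this commit):

    // rebalanceExample triggers a migration task on the leader; after this
    // commit only the task id travels in startMigrationCommand (step 2 above).
    func rebalanceExample(cluster *Cluster) error {
    	task := &raft.MigratingTask{
    		ID:         "task-0001",      // any unique task id
    		SrcNode:    "127.0.0.1:6399", // illustrative node addresses
    		TargetNode: "127.0.0.1:7399",
    		Slots:      []uint32{0, 1, 2, 3},
    	}
    	return cluster.triggerMigrationTask(task)
    }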

@@ -113,11 +135,7 @@ func (cluster *Cluster) triggerMigrationTask(task *raft.MigratingTask) error {
 	}
 	logger.Infof("propose EventStartMigrate %s success", task.ID)

-	cmdLine := utils.ToCmdLine(startMigrationCommand, task.ID, task.SrcNode)
-	for _, slotId := range task.Slots {
-		slotIdStr := strconv.Itoa(int(slotId))
-		cmdLine = append(cmdLine, []byte(slotIdStr))
-	}
+	cmdLine := utils.ToCmdLine(startMigrationCommand, task.ID)
 	targetNodeConn, err := cluster.connections.BorrowPeerClient(task.TargetNode)
 	if err != nil {
 		return err
