Skip to content

Commit a415a51

Browse files
authored
Merge pull request #43 from supermeng/master
Optimized for timeout
2 parents 58c52f2 + d3d2da9 commit a415a51

File tree

9 files changed

+64
-57
lines changed

9 files changed

+64
-57
lines changed

apiserver/server.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -181,7 +181,7 @@ func initOrcEngine(swarmAddr string, etcdAddr string, isDebug bool) (*engine.Orc
181181
return nil, err
182182
}
183183

184-
cluster, err := swarm.NewCluster(swarmAddr, 30*time.Second, 90*time.Second)
184+
cluster, err := swarm.NewCluster(swarmAddr, 10*time.Second, 20*time.Second)
185185
if err != nil {
186186
return nil, err
187187
}

engine/config.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ type Guard struct {
2727
const (
2828
EtcdResourcesKey = "/lain/config/resources"
2929
EtcdGuardSwitchKey = "/lain/config/guardswitch"
30+
EtcdCloudVolumeRootKey = "/lain/config/cloud_volumes_root"
31+
EtcdVolumeRootKey = "/lain/config/volumes_root"
3032

3133
EtcdConfigKey = "/lain/deployd/engine/config"
3234

engine/engine.go

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"fmt"
66
"strings"
77
"sync"
8+
"sync/atomic"
89
"time"
910

1011
"github.com/laincloud/deployd/cluster"
@@ -262,7 +263,7 @@ func (engine *OrcEngine) RescheduleSpec(name string, podSpec PodSpec) error {
262263
if pgCtrl, ok := engine.pgCtrls[name]; !ok {
263264
return ErrPodGroupNotExists
264265
} else {
265-
if err := canOperation(pgCtrl, PGOpStateScheduling); err != nil {
266+
if err := canOperation(pgCtrl, PGOpStateUpgrading); err != nil {
266267
return err
267268
}
268269
for _, depends := range podSpec.Dependencies {
@@ -567,10 +568,7 @@ func (engine *OrcEngine) initOperationWorker() {
567568
rInterval := RefreshInterval / 2 * 1000 / len(engine.pgCtrls)
568569
index := 0
569570
for _, pgCtrl := range engine.pgCtrls {
570-
pgCtrl.RLock()
571-
refreshable := pgCtrl.refreshable
572-
pgCtrl.RUnlock()
573-
if refreshable {
571+
if atomic.LoadInt32(&(pgCtrl.refreshable)) == 1 {
574572
interval := index * rInterval
575573
_pgCtrl := pgCtrl
576574
index++
@@ -652,8 +650,8 @@ func (engine *OrcEngine) checkPodGroupRemoveResult(name string, pgCtrl *podGroup
652650
}
653651

654652
func canOperation(pgCtrl *podGroupController, target PGOpState) error {
655-
if opState := pgCtrl.CanOperate(target); opState != PGOpStateIdle {
656-
return OperLockedError{info: opState.String()}
653+
if canOp := pgCtrl.CanOperate(target); !canOp {
654+
return OperLockedError{info: "Scheduling"}
657655
}
658656
return nil
659657
}

engine/events.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package engine
33
import (
44
"strings"
55
"sync"
6+
"sync/atomic"
67

78
"github.com/laincloud/deployd/utils/util"
89
"github.com/mijia/adoc"
@@ -76,13 +77,10 @@ func handleDieEvent(engine *OrcEngine, event *adoc.Event) {
7677
if !ok {
7778
return
7879
}
79-
8080
pgCtrl.RLock()
81-
state := pgCtrl.opState
8281
spec := pgCtrl.spec.Clone()
8382
pgCtrl.RUnlock()
84-
85-
if state != PGOpStateScheduling {
83+
if atomic.LoadInt32((*int32)(&pgCtrl.opState)) != PGOpStateUpgrading {
8684
log.Warnf("got %s event from %s, refresh this instance", event.Status, name)
8785
pgCtrl.opsChan <- pgOperRefreshInstance{instance, spec}
8886
}

engine/pod.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ var RestartInfoClearInterval time.Duration
2222

2323
const (
2424
DefaultHealthInterval = 10
25-
DefaultHealthTimeout = 1
25+
DefaultHealthTimeout = 3
2626
DefaultHealthRetries = 3
2727

2828
DefaultSetUpTime = 20
@@ -32,7 +32,7 @@ const (
3232
CPUMaxLevel = 8
3333
CPUDeafultLevel = 2
3434

35-
CURL_TMPLT = `echo $(timeout %v curl -s -o /dev/null -w '%%{http_code}\n' %s) | grep -Eq "^[2-3]..$"`
35+
CURL_TMPLT = `echo $(curl -m %v -s -o /dev/null -w '%%{http_code}\n' %s) | grep -Eq "^[2-3]..$"`
3636
)
3737

3838
// podController is controlled by the podGroupController
@@ -522,10 +522,12 @@ func (pc *podController) createHostConfig(index int) adoc.HostConfig {
522522
BlkioDeviceWriteIOps = append(BlkioDeviceWriteIOps, iops)
523523
}
524524
}
525+
swappiness := int64(0)
525526
hc := adoc.HostConfig{
526527
Resources: adoc.Resources{
527528
Memory: spec.MemoryLimit,
528529
MemorySwap: spec.MemoryLimit, // Memory == MemorySwap means disable swap
530+
MemorySwappiness: &swappiness,
529531
CPUPeriod: CPUQuota,
530532
CPUQuota: int64(spec.CpuLimit*resource.Cpu*CPUMaxPctg) * CPUQuota / int64(CPUMaxLevel*100),
531533
BlkioDeviceReadBps: BlkioDeviceReadBps,

engine/podgroup.go

Lines changed: 41 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"fmt"
66
"strings"
77
"sync"
8+
"sync/atomic"
89
"time"
910

1011
"github.com/laincloud/deployd/cluster"
@@ -39,7 +40,7 @@ type podGroupController struct {
3940
podCtrls []*podController
4041
opsChan chan pgOperation
4142

42-
refreshable bool
43+
refreshable int32
4344

4445
lastPodSpecKey string
4546
storedKey string
@@ -50,33 +51,32 @@ func (pgCtrl *podGroupController) String() string {
5051
return fmt.Sprintf("PodGroupCtrl %s", pgCtrl.spec)
5152
}
5253

53-
func (pgCtrl *podGroupController) CanOperate(pgops PGOpState) PGOpState {
54-
pgCtrl.Lock()
55-
defer pgCtrl.Unlock()
56-
if pgCtrl.opState == PGOpStateIdle {
57-
pgCtrl.opState = pgops
58-
return PGOpStateIdle
54+
func (pgCtrl *podGroupController) CanOperate(pgops PGOpState) bool {
55+
if atomic.CompareAndSwapInt32((*int32)(&pgCtrl.opState), PGOpStateIdle, int32(pgops)) {
56+
pgCtrl.DisableRefresh()
57+
return true
58+
} else if atomic.LoadInt32((*int32)(&pgCtrl.opState)) == PGOpStateUpgrading &&
59+
pgops == PGOpStateUpgrading {
60+
// the pg is already in the upgrading state: allow the new upgrade op so it can flush the old ops in the chan and start over
61+
pgCtrl.DisableRefresh()
62+
return true
5963
}
60-
return pgCtrl.opState
64+
return false
6165
}
6266

6367
func (pgCtrl *podGroupController) DisableRefresh() {
64-
pgCtrl.Lock()
65-
defer pgCtrl.Unlock()
66-
pgCtrl.refreshable = false
68+
atomic.StoreInt32(&(pgCtrl.refreshable), int32(0))
6769
}
6870

6971
func (pgCtrl *podGroupController) EnableRefresh() {
70-
pgCtrl.Lock()
71-
defer pgCtrl.Unlock()
72-
pgCtrl.refreshable = true
72+
atomic.StoreInt32(&(pgCtrl.refreshable), int32(1))
7373
}
7474

7575
// called by a single goroutine
7676
func (pgCtrl *podGroupController) OperateOver() {
77-
pgCtrl.Lock()
78-
defer pgCtrl.Unlock()
79-
pgCtrl.opState = PGOpStateIdle
77+
pgCtrl.emitOperationEvent(OperationOver)
78+
atomic.StoreInt32((*int32)(&pgCtrl.opState), PGOpStateIdle)
79+
pgCtrl.EnableRefresh()
8080
}
8181

8282
func (pgCtrl *podGroupController) Inspect() PodGroupWithSpec {
@@ -112,6 +112,7 @@ func (pgCtrl *podGroupController) IsPending() bool {
112112
}
113113

114114
func (pgCtrl *podGroupController) Deploy() {
115+
pgCtrl.flushAllOps()
115116
pgCtrl.emitOperationEvent(OperationStart)
116117
defer func() {
117118
pgCtrl.opsChan <- pgOperOver{}
@@ -139,6 +140,7 @@ func (pgCtrl *podGroupController) Deploy() {
139140
}
140141

141142
func (pgCtrl *podGroupController) RescheduleInstance(numInstances int, restartPolicy ...RestartPolicy) {
143+
pgCtrl.flushAllOps()
142144
pgCtrl.emitOperationEvent(OperationStart)
143145
defer func() {
144146
pgCtrl.opsChan <- pgOperOver{}
@@ -190,17 +192,14 @@ func (pgCtrl *podGroupController) RescheduleInstance(numInstances int, restartPo
190192
}
191193

192194
func (pgCtrl *podGroupController) RescheduleSpec(podSpec PodSpec) {
195+
pgCtrl.flushAllOps()
193196
pgCtrl.emitOperationEvent(OperationStart)
194197
defer func() {
195198
pgCtrl.opsChan <- pgOperOver{}
196199
}()
197200
pgCtrl.RLock()
198201
spec := pgCtrl.spec.Clone()
199202
pgCtrl.RUnlock()
200-
// pgCtrl.group.Pods[0].NodeName()
201-
if spec.Pod.Equals(podSpec) {
202-
return
203-
}
204203
pgCtrl.emptyError()
205204
if ok := pgCtrl.updatePodPorts(podSpec); !ok {
206205
return
@@ -230,6 +229,7 @@ func (pgCtrl *podGroupController) RescheduleSpec(podSpec PodSpec) {
230229
}
231230

232231
func (pgCtrl *podGroupController) RescheduleDrift(fromNode, toNode string, instanceNo int, force bool) {
232+
pgCtrl.flushAllOps()
233233
defer func() {
234234
pgCtrl.opsChan <- pgOperOver{}
235235
}()
@@ -254,6 +254,7 @@ func (pgCtrl *podGroupController) RescheduleDrift(fromNode, toNode string, insta
254254
}
255255

256256
func (pgCtrl *podGroupController) Remove() {
257+
pgCtrl.flushAllOps()
257258
pgCtrl.emitOperationEvent(OperationStart)
258259
defer func() {
259260
pgCtrl.opsChan <- pgOperOver{}
@@ -273,6 +274,7 @@ func (pgCtrl *podGroupController) Remove() {
273274
}
274275

275276
func (pgCtrl *podGroupController) ChangeState(op string, instance int) {
277+
pgCtrl.flushAllOps()
276278
pgCtrl.emitOperationEvent(OperationStart)
277279
defer func() {
278280
pgCtrl.opsChan <- pgOperOver{}
@@ -295,6 +297,10 @@ func (pgCtrl *podGroupController) Refresh(force bool) {
295297
if pgCtrl.IsRemoved() || pgCtrl.IsPending() {
296298
return
297299
}
300+
pgCtrl.DisableRefresh()
301+
defer func() {
302+
pgCtrl.opsChan <- pgOperOver{}
303+
}()
298304
pgCtrl.RLock()
299305
spec := pgCtrl.spec.Clone()
300306
pgCtrl.RUnlock()
@@ -328,11 +334,22 @@ func (pgCtrl *podGroupController) Activate(c cluster.Cluster, store storage.Stor
328334
}()
329335
}
330336

337+
func (pgCtrl *podGroupController) LastSpec() *PodGroupSpec {
338+
log.Infof("Fetch LastPodSpec !")
339+
var lastSpec PodGroupSpec
340+
if err := pgCtrl.engine.store.Get(pgCtrl.lastPodSpecKey, &lastSpec); err != nil {
341+
log.Infof("Fetch LastPodSpec with err:%v", err)
342+
return nil
343+
}
344+
log.Infof("Fetch LastPodSpec :%v", lastSpec)
345+
return &lastSpec
346+
}
347+
331348
/*
332349
* clean all ops in chan synchronously
333350
*
334351
*/
335-
func (pgCtrl *podGroupController) FlushAllOps() {
352+
func (pgCtrl *podGroupController) flushAllOps() {
336353
for {
337354
if len(pgCtrl.opsChan) == 0 {
338355
return
@@ -345,17 +362,6 @@ func (pgCtrl *podGroupController) FlushAllOps() {
345362
}
346363
}
347364

348-
func (pgCtrl *podGroupController) LastSpec() *PodGroupSpec {
349-
log.Infof("Fetch LastPodSpec !")
350-
var lastSpec PodGroupSpec
351-
if err := pgCtrl.engine.store.Get(pgCtrl.lastPodSpecKey, &lastSpec); err != nil {
352-
log.Infof("Fetch LastPodSpec with err:%v", err)
353-
return nil
354-
}
355-
log.Infof("Fetch LastPodSpec :%v", lastSpec)
356-
return &lastSpec
357-
}
358-
359365
/*
360366
* To clean corrupted containers which are not used by any cluster app any more
361367
* Should be called just after the podgroups have been refreshed, or the clean will work terribly
@@ -492,9 +498,6 @@ func (pgCtrl *podGroupController) checkPodPorts() bool {
492498

493499
func (pgCtrl *podGroupController) updatePodPorts(podSpec PodSpec) bool {
494500
spec := pgCtrl.spec
495-
if spec.Pod.Equals(podSpec) {
496-
return true
497-
}
498501
var oldsps, sps StreamPorts
499502
if err := json.Unmarshal([]byte(spec.Pod.Annotation), &oldsps); err != nil {
500503
log.Errorf("annotation unmarshal error:%v\n", err)
@@ -573,7 +576,7 @@ func (pgCtrl *podGroupController) rollBack() bool {
573576
// 1. disable refresh(so no others can produce operation) and flush ops chan
574577
log.Infof("Start Rollback!")
575578
pgCtrl.DisableRefresh()
576-
pgCtrl.FlushAllOps()
579+
pgCtrl.flushAllOps()
577580
// 2. rollback podgroup podspec info
578581
pgCtrl.Lock()
579582
spec := pgCtrl.spec.Clone()
@@ -692,7 +695,7 @@ func newPodGroupController(spec PodGroupSpec, states []PodPrevState, pg PodGroup
692695
podCtrls: podCtrls,
693696
opsChan: make(chan pgOperation, 500),
694697

695-
refreshable: true,
698+
refreshable: 1,
696699

697700
lastPodSpecKey: strings.Join([]string{kLainDeploydRootKey, kLainLastPodSpecKey, spec.Namespace, spec.Name}, "/"),
698701
storedKey: strings.Join([]string{kLainDeploydRootKey, kLainPodGroupKey, spec.Namespace, spec.Name}, "/"),

engine/podgroup_ops.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,10 @@ func (op pgOperRefreshInstance) Do(pgCtrl *podGroupController, c cluster.Cluster
158158
pgCtrl.RUnlock()
159159
}()
160160

161+
if(op.instanceNo > len(pgCtrl.podCtrls)){
162+
log.Warnf("Pod is not exists")
163+
return false
164+
}
161165
podCtrl := pgCtrl.podCtrls[op.instanceNo-1]
162166

163167
podCtrl.Refresh(c)
@@ -604,6 +608,5 @@ type pgOperOver struct {
604608

605609
func (op pgOperOver) Do(pgCtrl *podGroupController, c cluster.Cluster, store storage.Store, ev *RuntimeEagleView) bool {
606610
pgCtrl.OperateOver()
607-
pgCtrl.emitOperationEvent(OperationOver)
608611
return false
609612
}

engine/runtimes.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ const (
3939

4040
const (
4141
PGOpStateIdle = iota
42+
PGOpStateUpgrading
4243
PGOpStateScheduling
4344
PGOpStateDrifting
4445
PGOpStateRemoving
@@ -102,6 +103,8 @@ func (pgos PGOpState) String() string {
102103
switch pgos {
103104
case PGOpStateIdle:
104105
return "Idle"
106+
case PGOpStateUpgrading:
107+
return "Upgrading"
105108
case PGOpStateScheduling:
106109
return "Scheduling"
107110
case PGOpStateDrifting:

engine/specs.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,6 @@ const (
3232
MinPodKillTimeout = 10
3333
MaxPodKillTimeout = 120
3434

35-
etcdCloudVolumeRootKey = "/lain/config/cloud_volumes_root"
36-
etcdVolumeRootKey = "/lain/config/volumes_root"
3735
)
3836

3937
var (
@@ -60,11 +58,11 @@ type ContainerLabel struct {
6058
}
6159

6260
func configSpecsVars(store storage.Store) error {
63-
if v, err := store.GetRaw(etcdCloudVolumeRootKey); err == nil {
61+
if v, err := store.GetRaw(EtcdCloudVolumeRootKey); err == nil {
6462
kLainCloudVolumeRoot = v
6563
}
6664

67-
if v, err := store.GetRaw(etcdVolumeRootKey); err == nil {
65+
if v, err := store.GetRaw(EtcdVolumeRootKey); err == nil {
6866
kLainVolumeRoot = v
6967
}
7068
log.Debugf("cloud_volume_root: %s, volumes_root: %s", kLainCloudVolumeRoot, kLainVolumeRoot)

0 commit comments

Comments
 (0)