Skip to content

Commit 967f4cc

Browse files
balance: fix connection is repeatedly migrated by CPU and location (#826) (#827)
Co-authored-by: djshow832 <[email protected]>
1 parent 870007e commit 967f4cc

File tree

6 files changed

+26
-25
lines changed

6 files changed

+26
-25
lines changed

pkg/balance/factor/factor.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ const (
1515
AdviceNeutral BalanceAdvice = iota
1616
// AdviceNegtive indicates don't balance these 2 backends, even for the rest factors.
1717
AdviceNegtive
18-
// AdvicePositive indicates balancing these 2 backends now.
18+
// AdvicePositive indicates balancing these 2 backends now. It only works when the source score is greater.
1919
AdvicePositive
2020
)
2121

@@ -27,7 +27,7 @@ type Factor interface {
2727
// ScoreBitNum returns the bit number of the score.
2828
ScoreBitNum() int
2929
// BalanceCount returns the count of connections to balance per second.
30-
// 0 indicates the factor is already balanced.
30+
// The caller ensures that the score of from is greater or equal to the score of to.
3131
BalanceCount(from, to scoredBackend) (BalanceAdvice, float64, []zap.Field)
3232
SetConfig(cfg *config.Config)
3333
// CanBeRouted returns whether a connection can be routed or migrated to the backend with the score.

pkg/balance/factor/factor_balance.go

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -271,33 +271,34 @@ func (fbb *FactorBasedBalance) BackendsToBalance(backends []policy.BackendCtx) (
271271
continue
272272
}
273273
leftBitNum := fbb.totalBitNum
274+
logFields = logFields[:0]
274275
for _, factor := range fbb.factors {
275276
bitNum := factor.ScoreBitNum()
276277
score1 := scoredBackends[i].scoreBits << (maxBitNum - leftBitNum) >> (maxBitNum - bitNum)
277278
score2 := scoredBackends[0].scoreBits << (maxBitNum - leftBitNum) >> (maxBitNum - bitNum)
278-
if score1 > score2 {
279+
if score1 >= score2 {
279280
// The factors with higher priorities are ordered, so this factor shouldn't violate them.
280281
// E.g. if the CPU usage of A is higher than B, don't migrate from B to A even if A is preferred in location.
281-
var fields []zap.Field
282282
var advice BalanceAdvice
283+
var fields []zap.Field
283284
advice, balanceCount, fields = factor.BalanceCount(scoredBackends[i], scoredBackends[0])
284285
if advice == AdviceNegtive {
285286
// If the factor will be unbalanced after migration, skip the rest factors.
286287
// E.g. if the CPU usage of A will be much higher than B after migration,
287288
// don't migrate from B to A even if A is preferred in location.
288289
break
289-
} else if advice == AdvicePositive {
290-
if balanceCount > 0.0001 {
291-
from, to = scoredBackends[i].BackendCtx, scoredBackends[0].BackendCtx
292-
reason = factor.Name()
293-
logFields = append(fields, zap.String("factor", reason),
294-
zap.String("from_total_score", strconv.FormatUint(scoredBackends[i].scoreBits, 16)),
295-
zap.String("to_total_score", strconv.FormatUint(scoredBackends[0].scoreBits, 16)),
296-
zap.Uint64("from_factor_score", score1),
297-
zap.Uint64("to_factor_score", score2),
298-
zap.Float64("balance_count", balanceCount))
299-
return
300-
}
290+
}
291+
logFields = append(logFields, fields...)
292+
if score1 > score2 && advice == AdvicePositive && balanceCount > 0.0001 {
293+
from, to = scoredBackends[i].BackendCtx, scoredBackends[0].BackendCtx
294+
reason = factor.Name()
295+
logFields = append(logFields, zap.String("factor", reason),
296+
zap.String("from_total_score", strconv.FormatUint(scoredBackends[i].scoreBits, 16)),
297+
zap.String("to_total_score", strconv.FormatUint(scoredBackends[0].scoreBits, 16)),
298+
zap.Uint64("from_factor_score", score1),
299+
zap.Uint64("to_factor_score", score2),
300+
zap.Float64("balance_count", balanceCount))
301+
return
301302
}
302303
} else if score1 < score2 {
303304
// Stop it once a factor is in the opposite order, otherwise a subsequent factor may violate this one.

pkg/balance/factor/factor_balance_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -477,8 +477,8 @@ func TestCanBeRouted(t *testing.T) {
477477

478478
func TestCanBalance(t *testing.T) {
479479
fm := NewFactorBasedBalance(zap.NewNop(), newMockMetricsReader())
480-
factor1 := &mockFactor{bitNum: 2, balanceCount: 1, advice: AdviceNegtive}
481-
factor2 := &mockFactor{bitNum: 2, balanceCount: 1}
480+
factor1 := &mockFactor{bitNum: 2, balanceCount: 1, canBeRouted: true, advice: AdviceNegtive}
481+
factor2 := &mockFactor{bitNum: 2, balanceCount: 1, canBeRouted: true}
482482
fm.factors = []Factor{factor1, factor2}
483483
require.NoError(t, fm.updateBitNum())
484484

pkg/balance/factor/factor_cpu.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -270,7 +270,7 @@ func (fc *FactorCPU) BalanceCount(from, to scoredBackend) (BalanceAdvice, float6
270270
// E.g. 10% vs 25% don't need rebalance, but 80% vs 95% need rebalance.
271271
// Use the average usage to avoid thrash when CPU jitters too much and use the latest usage to avoid migrate too many connections.
272272
if 1.3-toAvgUsage < (1.3-fromAvgUsage)*cpuBalancedRatio || 1.3-toLatestUsage < (1.3-fromLatestUsage)*cpuBalancedRatio {
273-
return AdviceNeutral, 0, nil
273+
return AdviceNeutral, 0, fields
274274
}
275275
return AdvicePositive, 1 / fc.usagePerConn / balanceRatio4Cpu, fields
276276
}

pkg/balance/factor/factor_health.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -353,13 +353,13 @@ func (fh *FactorHealth) BalanceCount(from, to scoredBackend) (BalanceAdvice, flo
353353
// Only migrate connections when one is valueRangeNormal and the other is valueRangeAbnormal.
354354
fromScore := fh.caclErrScore(from.Addr())
355355
toScore := fh.caclErrScore(to.Addr())
356-
if fromScore-toScore <= 1 {
357-
return AdviceNeutral, 0, nil
358-
}
359356
snapshot := fh.snapshot[from.Addr()]
360357
fields := []zap.Field{
361358
zap.String("indicator", snapshot.indicator),
362359
zap.Int("value", snapshot.value)}
360+
if fromScore-toScore <= 1 {
361+
return AdviceNeutral, 0, fields
362+
}
363363
return AdvicePositive, snapshot.balanceCount, fields
364364
}
365365

pkg/balance/factor/factor_memory.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -274,14 +274,14 @@ func (fm *FactorMemory) BalanceCount(from, to scoredBackend) (BalanceAdvice, flo
274274
// So we only rebalance when the difference of risk levels of 2 backends is big enough.
275275
fromSnapshot := fm.snapshot[from.Addr()]
276276
toSnapshot := fm.snapshot[to.Addr()]
277-
if fromSnapshot.riskLevel-toSnapshot.riskLevel <= 1 {
278-
return AdviceNeutral, 0, nil
279-
}
280277
fields := []zap.Field{
281278
zap.Duration("from_time_to_oom", fromSnapshot.timeToOOM),
282279
zap.Float64("from_mem_usage", fromSnapshot.memUsage),
283280
zap.Duration("to_time_to_oom", toSnapshot.timeToOOM),
284281
zap.Float64("to_mem_usage", toSnapshot.memUsage)}
282+
if fromSnapshot.riskLevel-toSnapshot.riskLevel <= 1 {
283+
return AdviceNeutral, 0, fields
284+
}
285285
return AdvicePositive, fromSnapshot.balanceCount, fields
286286
}
287287

0 commit comments

Comments
 (0)