Skip to content

Commit 3a730da

Browse files
authored
add poller autoscaling in activity and decision workers (#1186)
* add pollerAutoScalerOptions
* add tests for options
* change autoscaler interface to include Stop method
* add poller auto scaler to task worker
* gracefully stop autoscaler
* move up autoscaler in pollTask
* add unit tests
1 parent d94db89 commit 3a730da

File tree

8 files changed

+325
-51
lines changed

8 files changed

+325
-51
lines changed

internal/common/autoscaler/autoscaler.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@ type (
3131
// GetCurrent ResourceUnit of resource
3232
GetCurrent() ResourceUnit
3333
// Start starts the autoscaler go routine that scales the ResourceUnit according to Estimator
34-
Start() DoneFunc
34+
Start()
35+
// Stop stops the autoscaler if started or do nothing if not yet started
36+
Stop()
3537
}
36-
// DoneFunc func to turn off auto scaler
37-
DoneFunc func()
3838
)

internal/internal_poller_autoscaler.go

Lines changed: 37 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,15 @@ import (
2727
"go.uber.org/atomic"
2828
"go.uber.org/cadence/internal/common/autoscaler"
2929
"go.uber.org/zap"
30+
"sync"
3031
"time"
3132
)
3233

3334
// default configuration values for the poller autoscaler
3435
const (
35-
defaultPollerScalerCooldownInSeconds = 120
36+
defaultPollerAutoScalerCooldown = time.Minute
37+
defaultPollerAutoScalerTargetUtilization = 0.6
38+
defaultMinConcurrentPollerSize = 1
3639
)
3740

3841
var (
@@ -50,6 +53,7 @@ type (
5053
sem semaphore.Semaphore // resizable semaphore to control number of concurrent pollers
5154
ctx context.Context
5255
cancel context.CancelFunc
56+
wg *sync.WaitGroup // graceful stop
5357
recommender autoscaler.Recommender
5458
onAutoScale []func() // hook functions that run post autoscale
5559
}
@@ -60,31 +64,41 @@ type (
6064
// This avoids unnecessary usage of CompareAndSwap
6165
atomicBits *atomic.Uint64
6266
}
67+
68+
pollerAutoScalerOptions struct {
69+
Enabled bool
70+
InitCount int
71+
MinCount int
72+
MaxCount int
73+
Cooldown time.Duration
74+
DryRun bool
75+
TargetUtilization float64
76+
}
6377
)
6478

6579
func newPollerScaler(
66-
initialPollerCount int,
67-
minPollerCount int,
68-
maxPollerCount int,
69-
targetMilliUsage uint64,
70-
isDryRun bool,
71-
cooldownTime time.Duration,
80+
options pollerAutoScalerOptions,
7281
logger *zap.Logger,
7382
hooks ...func()) *pollerAutoScaler {
7483
ctx, cancel := context.WithCancel(context.Background())
84+
if !options.Enabled {
85+
return nil
86+
}
87+
7588
return &pollerAutoScaler{
76-
isDryRun: isDryRun,
77-
cooldownTime: cooldownTime,
89+
isDryRun: options.DryRun,
90+
cooldownTime: options.Cooldown,
7891
logger: logger,
79-
sem: semaphore.New(initialPollerCount),
92+
sem: semaphore.New(options.InitCount),
93+
wg: &sync.WaitGroup{},
8094
ctx: ctx,
8195
cancel: cancel,
8296
pollerUsageEstimator: pollerUsageEstimator{atomicBits: atomic.NewUint64(0)},
8397
recommender: autoscaler.NewLinearRecommender(
84-
autoscaler.ResourceUnit(minPollerCount),
85-
autoscaler.ResourceUnit(maxPollerCount),
98+
autoscaler.ResourceUnit(options.MinCount),
99+
autoscaler.ResourceUnit(options.MaxCount),
86100
autoscaler.Usages{
87-
autoscaler.PollerUtilizationRate: autoscaler.MilliUsage(targetMilliUsage),
101+
autoscaler.PollerUtilizationRate: autoscaler.MilliUsage(options.TargetUtilization * 1000),
88102
},
89103
),
90104
onAutoScale: hooks,
@@ -107,9 +121,11 @@ func (p *pollerAutoScaler) GetCurrent() autoscaler.ResourceUnit {
107121
}
108122

109123
// Start starts the auto-scaler go routine; call Stop to stop it
110-
func (p *pollerAutoScaler) Start() autoscaler.DoneFunc {
124+
func (p *pollerAutoScaler) Start() {
111125
logger := p.logger.Sugar()
126+
p.wg.Add(1)
112127
go func() {
128+
defer p.wg.Done()
113129
for {
114130
select {
115131
case <-p.ctx.Done():
@@ -139,9 +155,13 @@ func (p *pollerAutoScaler) Start() autoscaler.DoneFunc {
139155
}
140156
}
141157
}()
142-
return func() {
143-
p.cancel()
144-
}
158+
return
159+
}
160+
161+
// Stop stops the poller autoscaler
162+
func (p *pollerAutoScaler) Stop() {
163+
p.cancel()
164+
p.wg.Wait()
145165
}
146166

147167
// Reset metrics from the start

internal/internal_poller_autoscaler_test.go

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import (
3333

3434
func Test_pollerAutoscaler(t *testing.T) {
3535
type args struct {
36+
disabled bool
3637
noTaskPoll, taskPoll, unrelated int
3738
initialPollerCount int
3839
minPollerCount int
@@ -146,24 +147,36 @@ func Test_pollerAutoscaler(t *testing.T) {
146147
},
147148
want: 6,
148149
},
150+
{
151+
name: "disabled",
152+
args: args{disabled: true},
153+
},
149154
}
150155

151156
for _, tt := range tests {
152157
t.Run(tt.name, func(t *testing.T) {
153158
autoscalerEpoch := atomic.NewUint64(0)
154159
pollerScaler := newPollerScaler(
155-
tt.args.initialPollerCount,
156-
tt.args.minPollerCount,
157-
tt.args.maxPollerCount,
158-
tt.args.targetMilliUsage,
159-
tt.args.isDryRun,
160-
tt.args.cooldownTime,
160+
pollerAutoScalerOptions{
161+
Enabled: !tt.args.disabled,
162+
InitCount: tt.args.initialPollerCount,
163+
MinCount: tt.args.minPollerCount,
164+
MaxCount: tt.args.maxPollerCount,
165+
Cooldown: tt.args.cooldownTime,
166+
DryRun: tt.args.isDryRun,
167+
TargetUtilization: float64(tt.args.targetMilliUsage) / 1000,
168+
},
161169
zaptest.NewLogger(t),
162170
// hook function that collects number of iterations
163171
func() {
164172
autoscalerEpoch.Add(1)
165173
})
166-
pollerScalerDone := pollerScaler.Start()
174+
if tt.args.disabled {
175+
assert.Nil(t, pollerScaler)
176+
return
177+
}
178+
179+
pollerScaler.Start()
167180

168181
// simulate concurrent polling
169182
pollChan := generateRandomPollResults(tt.args.noTaskPoll, tt.args.taskPoll, tt.args.unrelated)
@@ -183,7 +196,7 @@ func Test_pollerAutoscaler(t *testing.T) {
183196
assert.Eventually(t, func() bool {
184197
return autoscalerEpoch.Load() == uint64(tt.args.autoScalerEpoch)
185198
}, tt.args.cooldownTime+20*time.Millisecond, 10*time.Millisecond)
186-
pollerScalerDone()
199+
pollerScaler.Stop()
187200
res := pollerScaler.GetCurrent()
188201
assert.Equal(t, tt.want, int(res))
189202
})

internal/internal_utils.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ const (
7171
type (
7272
FeatureFlags struct {
7373
WorkflowExecutionAlreadyCompletedErrorEnabled bool
74+
PollerAutoScalerEnabled bool
7475
}
7576
)
7677

internal/internal_worker.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,15 @@ func newWorkflowTaskWorkerInternal(
255255
params,
256256
)
257257
worker := newBaseWorker(baseWorkerOptions{
258+
pollerAutoScaler: pollerAutoScalerOptions{
259+
Enabled: params.FeatureFlags.PollerAutoScalerEnabled,
260+
InitCount: params.MaxConcurrentDecisionTaskPollers,
261+
MinCount: params.MinConcurrentDecisionTaskPollers,
262+
MaxCount: params.MaxConcurrentDecisionTaskPollers,
263+
Cooldown: params.PollerAutoScalerCooldown,
264+
DryRun: params.PollerAutoScalerDryRun,
265+
TargetUtilization: params.PollerAutoScalerTargetUtilization,
266+
},
258267
pollerCount: params.MaxConcurrentDecisionTaskPollers,
259268
pollerRate: defaultPollerRate,
260269
maxConcurrentTask: params.MaxConcurrentDecisionTaskExecutionSize,
@@ -443,6 +452,15 @@ func newActivityTaskWorker(
443452
ensureRequiredParams(&workerParams)
444453
base := newBaseWorker(
445454
baseWorkerOptions{
455+
pollerAutoScaler: pollerAutoScalerOptions{
456+
Enabled: workerParams.FeatureFlags.PollerAutoScalerEnabled,
457+
InitCount: workerParams.MaxConcurrentActivityTaskPollers,
458+
MinCount: workerParams.MinConcurrentActivityTaskPollers,
459+
MaxCount: workerParams.MaxConcurrentActivityTaskPollers,
460+
Cooldown: workerParams.PollerAutoScalerCooldown,
461+
DryRun: workerParams.PollerAutoScalerDryRun,
462+
TargetUtilization: workerParams.PollerAutoScalerTargetUtilization,
463+
},
446464
pollerCount: workerParams.MaxConcurrentActivityTaskPollers,
447465
pollerRate: defaultPollerRate,
448466
maxConcurrentTask: workerParams.MaxConcurrentActivityExecutionSize,
@@ -1198,6 +1216,18 @@ func augmentWorkerOptions(options WorkerOptions) WorkerOptions {
11981216
if options.MaxConcurrentSessionExecutionSize == 0 {
11991217
options.MaxConcurrentSessionExecutionSize = defaultMaxConcurrentSessionExecutionSize
12001218
}
1219+
if options.MinConcurrentActivityTaskPollers == 0 {
1220+
options.MinConcurrentActivityTaskPollers = defaultMinConcurrentPollerSize
1221+
}
1222+
if options.MinConcurrentDecisionTaskPollers == 0 {
1223+
options.MinConcurrentDecisionTaskPollers = defaultMinConcurrentPollerSize
1224+
}
1225+
if options.PollerAutoScalerCooldown == 0 {
1226+
options.PollerAutoScalerCooldown = defaultPollerAutoScalerCooldown
1227+
}
1228+
if options.PollerAutoScalerTargetUtilization == 0 {
1229+
options.PollerAutoScalerTargetUtilization = defaultPollerAutoScalerTargetUtilization
1230+
}
12011231

12021232
// if the user passes in a tracer then add a tracing context propagator
12031233
if options.Tracer != nil {

internal/internal_worker_base.go

Lines changed: 47 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ type (
107107

108108
// baseWorkerOptions options to configure base worker.
109109
baseWorkerOptions struct {
110+
pollerAutoScaler pollerAutoScalerOptions
110111
pollerCount int
111112
pollerRate int
112113
maxConcurrentTask int
@@ -133,6 +134,7 @@ type (
133134
metricsScope tally.Scope
134135

135136
pollerRequestCh chan struct{}
137+
pollerAutoScaler *pollerAutoScaler
136138
taskQueueCh chan interface{}
137139
sessionTokenBucket *sessionTokenBucket
138140
}
@@ -156,15 +158,25 @@ func createPollRetryPolicy() backoff.RetryPolicy {
156158

157159
func newBaseWorker(options baseWorkerOptions, logger *zap.Logger, metricsScope tally.Scope, sessionTokenBucket *sessionTokenBucket) *baseWorker {
158160
ctx, cancel := context.WithCancel(context.Background())
161+
162+
var pollerAS *pollerAutoScaler
163+
if pollerOptions := options.pollerAutoScaler; pollerOptions.Enabled {
164+
pollerAS = newPollerScaler(
165+
pollerOptions,
166+
logger,
167+
)
168+
}
169+
159170
bw := &baseWorker{
160-
options: options,
161-
shutdownCh: make(chan struct{}),
162-
taskLimiter: rate.NewLimiter(rate.Limit(options.maxTaskPerSecond), 1),
163-
retrier: backoff.NewConcurrentRetrier(pollOperationRetryPolicy),
164-
logger: logger.With(zapcore.Field{Key: tagWorkerType, Type: zapcore.StringType, String: options.workerType}),
165-
metricsScope: tagScope(metricsScope, tagWorkerType, options.workerType),
166-
pollerRequestCh: make(chan struct{}, options.maxConcurrentTask),
167-
taskQueueCh: make(chan interface{}), // no buffer, so poller only able to poll new task after previous is dispatched.
171+
options: options,
172+
shutdownCh: make(chan struct{}),
173+
taskLimiter: rate.NewLimiter(rate.Limit(options.maxTaskPerSecond), 1),
174+
retrier: backoff.NewConcurrentRetrier(pollOperationRetryPolicy),
175+
logger: logger.With(zapcore.Field{Key: tagWorkerType, Type: zapcore.StringType, String: options.workerType}),
176+
metricsScope: tagScope(metricsScope, tagWorkerType, options.workerType),
177+
pollerRequestCh: make(chan struct{}, options.maxConcurrentTask),
178+
pollerAutoScaler: pollerAS,
179+
taskQueueCh: make(chan interface{}), // no buffer, so poller only able to poll new task after previous is dispatched.
168180

169181
limiterContext: ctx,
170182
limiterContextCancel: cancel,
@@ -185,6 +197,10 @@ func (bw *baseWorker) Start() {
185197

186198
bw.metricsScope.Counter(metrics.WorkerStartCounter).Inc(1)
187199

200+
if bw.pollerAutoScaler != nil {
201+
bw.pollerAutoScaler.Start()
202+
}
203+
188204
for i := 0; i < bw.options.pollerCount; i++ {
189205
bw.shutdownWG.Add(1)
190206
go bw.runPoller()
@@ -255,9 +271,24 @@ func (bw *baseWorker) runTaskDispatcher() {
255271
}
256272
}
257273

274+
/*
275+
There are three types of constraint on polling tasks:
276+
1. poller auto scaler is to constraint number of concurrent pollers
277+
2. retrier is a backoff constraint on errors
278+
3. limiter is a per-second constraint
279+
*/
258280
func (bw *baseWorker) pollTask() {
259281
var err error
260282
var task interface{}
283+
284+
if bw.pollerAutoScaler != nil {
285+
if pErr := bw.pollerAutoScaler.Acquire(1); pErr == nil {
286+
defer bw.pollerAutoScaler.Release(1)
287+
} else {
288+
bw.logger.Warn("poller auto scaler acquire error", zap.Error(pErr))
289+
}
290+
}
291+
261292
bw.retrier.Throttle()
262293
if bw.pollLimiter == nil || bw.pollLimiter.Wait(bw.limiterContext) == nil {
263294
task, err = bw.options.taskWorker.PollTask()
@@ -273,6 +304,11 @@ func (bw *baseWorker) pollTask() {
273304
}
274305
bw.retrier.Failed()
275306
} else {
307+
if bw.pollerAutoScaler != nil {
308+
if pErr := bw.pollerAutoScaler.CollectUsage(task); pErr != nil {
309+
bw.logger.Warn("poller auto scaler collect usage error", zap.Error(pErr))
310+
}
311+
}
276312
bw.retrier.Succeeded()
277313
}
278314
}
@@ -347,6 +383,9 @@ func (bw *baseWorker) Stop() {
347383
}
348384
close(bw.shutdownCh)
349385
bw.limiterContextCancel()
386+
if bw.pollerAutoScaler != nil {
387+
bw.pollerAutoScaler.Stop()
388+
}
350389

351390
if success := util.AwaitWaitGroup(&bw.shutdownWG, bw.options.shutdownTimeout); !success {
352391
traceLog(func() {

0 commit comments

Comments
 (0)