@@ -106,9 +106,11 @@ type changeAggregator struct {
106
106
// eventConsumer consumes the event.
107
107
eventConsumer eventConsumer
108
108
109
- nextHighWaterFlush time.Time // next time high watermark may be flushed.
110
- flushFrequency time.Duration // how often high watermark can be checkpointed.
111
- lastSpanFlush time.Time // last time expensive, span based checkpoint was written.
109
+ flushFrequency time.Duration // how often high watermark can be checkpointed.
110
+
111
+ // frontierFlushLimiter is a rate limiter for flushing the span frontier
112
+ // to the coordinator.
113
+ frontierFlushLimiter * saveRateLimiter
112
114
113
115
// frontier keeps track of resolved timestamps for spans along with schema change
114
116
// boundary information.
@@ -281,6 +283,22 @@ func newChangeAggregatorProcessor(
281
283
ca .flushFrequency = changefeedbase .DefaultMinCheckpointFrequency
282
284
}
283
285
286
+ ca .frontierFlushLimiter , err = newSaveRateLimiter (saveRateConfig {
287
+ name : "frontier" ,
288
+ intervalName : func () redact.SafeValue {
289
+ return redact .SafeString (changefeedbase .OptMinCheckpointFrequency )
290
+ },
291
+ interval : func () time.Duration {
292
+ return ca .flushFrequency
293
+ },
294
+ jitter : func () float64 {
295
+ return aggregatorFlushJitter .Get (& ca .FlowCtx .Cfg .Settings .SV )
296
+ },
297
+ }, timeutil.DefaultTimeSource {})
298
+ if err != nil {
299
+ return nil , err
300
+ }
301
+
284
302
return ca , nil
285
303
}
286
304
@@ -461,9 +479,6 @@ func (ca *changeAggregator) Start(ctx context.Context) {
461
479
462
480
// Init heartbeat timer.
463
481
ca .lastPush = timeutil .Now ()
464
-
465
- // Generate expensive checkpoint only after we ran for a while.
466
- ca .lastSpanFlush = timeutil .Now ()
467
482
}
468
483
469
484
func (ca * changeAggregator ) startKVFeed (
@@ -730,18 +745,6 @@ var aggregatorFlushJitter = settings.RegisterFloatSetting(
730
745
settings .WithPublic ,
731
746
)
732
747
733
- func nextFlushWithJitter (s timeutil.TimeSource , d time.Duration , j float64 ) (time.Time , error ) {
734
- if j < 0 || d < 0 {
735
- return s .Now (), errors .AssertionFailedf ("invalid jitter value: %f, duration: %s" , j , d )
736
- }
737
- maxJitter := int64 (j * float64 (d ))
738
- if maxJitter == 0 {
739
- return s .Now ().Add (d ), nil
740
- }
741
- nextFlush := d + time .Duration (rand .Int63n (maxJitter ))
742
- return s .Now ().Add (nextFlush ), nil
743
- }
744
-
745
748
// Next is part of the RowSource interface.
746
749
func (ca * changeAggregator ) Next () (rowenc.EncDatumRow , * execinfrapb.ProducerMetadata ) {
747
750
shouldEmitHeartBeat := func () bool {
@@ -897,7 +900,7 @@ func (ca *changeAggregator) flushBufferedEvents(ctx context.Context) error {
897
900
// noteResolvedSpan periodically flushes Frontier progress from the current
898
901
// changeAggregator node to the changeFrontier node to allow the changeFrontier
899
902
// to persist the overall changefeed's progress
900
- func (ca * changeAggregator ) noteResolvedSpan (resolved jobspb.ResolvedSpan ) ( returnErr error ) {
903
+ func (ca * changeAggregator ) noteResolvedSpan (resolved jobspb.ResolvedSpan ) error {
901
904
ctx , sp := tracing .ChildSpan (ca .Ctx (), "changefeed.aggregator.note_resolved_span" )
902
905
defer sp .Finish ()
903
906
@@ -935,32 +938,17 @@ func (ca *changeAggregator) noteResolvedSpan(resolved jobspb.ResolvedSpan) (retu
935
938
// TODO(yevgeniy): Consider doing something similar to how job checkpointing
936
939
// works in the frontier where if we missed the window to checkpoint, we will attempt
937
940
// the checkpoint at the next opportune moment.
938
- checkpointFrontier := advanced &&
939
- (forceFlush || timeutil .Now ().After (ca .nextHighWaterFlush ))
941
+ checkpointFrontier := (advanced && forceFlush ) || ca .frontierFlushLimiter .canSave (ctx )
940
942
941
943
if checkpointFrontier {
942
- defer func () {
943
- ca .nextHighWaterFlush , err = nextFlushWithJitter (
944
- timeutil.DefaultTimeSource {}, ca .flushFrequency , aggregatorFlushJitter .Get (sv ))
945
- if err != nil {
946
- returnErr = errors .CombineErrors (returnErr , err )
947
- }
948
- }()
949
- return ca .flushFrontier (ctx )
944
+ now := timeutil .Now ()
945
+ if err := ca .flushFrontier (ctx ); err != nil {
946
+ return err
947
+ }
948
+ ca .frontierFlushLimiter .doneSave (timeutil .Since (now ))
950
949
}
951
950
952
- // At a lower frequency, we checkpoint specific spans in the job progress
953
- // either in backfills or if the highwater mark is excessively lagging behind.
954
- checkpointSpans := (ca .frontier .InBackfill (resolved ) || ca .frontier .HasLaggingSpans (sv )) &&
955
- canCheckpointSpans (sv , ca .lastSpanFlush )
956
-
957
- if checkpointSpans {
958
- defer func () {
959
- ca .lastSpanFlush = timeutil .Now ()
960
- }()
961
- return ca .flushFrontier (ctx )
962
- }
963
- return returnErr
951
+ return nil
964
952
}
965
953
966
954
// flushFrontier flushes sink and emits resolved spans to the change frontier.
@@ -1313,8 +1301,18 @@ func newChangeFrontierProcessor(
1313
1301
cf .freqEmitResolved = emitNoResolved
1314
1302
}
1315
1303
1316
- cf .frontierPersistenceLimiter = newSaveRateLimiter (
1317
- "frontier" /* name */ , changefeedbase .FrontierPersistenceInterval )
1304
+ cf .frontierPersistenceLimiter , err = newSaveRateLimiter (saveRateConfig {
1305
+ name : "frontier" ,
1306
+ intervalName : func () redact.SafeValue {
1307
+ return changefeedbase .FrontierPersistenceInterval .Name ()
1308
+ },
1309
+ interval : func () time.Duration {
1310
+ return changefeedbase .FrontierPersistenceInterval .Get (& cf .FlowCtx .Cfg .Settings .SV )
1311
+ },
1312
+ }, timeutil.DefaultTimeSource {})
1313
+ if err != nil {
1314
+ return nil , err
1315
+ }
1318
1316
1319
1317
encodingOpts , err := opts .GetEncodingOptions ()
1320
1318
if err != nil {
@@ -1919,7 +1917,7 @@ func (cf *changeFrontier) maybePersistFrontier(ctx context.Context) error {
1919
1917
1920
1918
if cf .spec .JobID == 0 ||
1921
1919
! cf .evalCtx .Settings .Version .IsActive (ctx , clusterversion .V25_4 ) ||
1922
- ! cf .frontierPersistenceLimiter .canSave (ctx , & cf . FlowCtx . Cfg . Settings . SV ) {
1920
+ ! cf .frontierPersistenceLimiter .canSave (ctx ) {
1923
1921
return nil
1924
1922
}
1925
1923
@@ -2349,41 +2347,62 @@ func shouldCountUsageError(err error) bool {
2349
2347
status .Code (err ) != codes .Canceled
2350
2348
}
2351
2349
2352
- // durationSetting is a duration cluster setting.
2353
- type durationSetting interface {
2354
- Name () settings.SettingName
2355
- Get (sv * settings.Values ) time.Duration
2350
+ // saveRateConfig is the config for a saveRateLimiter.
2351
+ type saveRateConfig struct {
2352
+ name redact.SafeString
2353
+ intervalName func () redact.SafeValue
2354
+ interval func () time.Duration
2355
+ jitter func () float64 // optional
2356
2356
}
2357
2357
2358
2358
// saveRateLimiter is a rate limiter for saving a piece of progress.
2359
2359
// It uses a duration setting as the minimum interval between saves.
2360
2360
// It also limits saving to not be more frequent than the average
2361
2361
// duration it takes to save progress.
2362
2362
type saveRateLimiter struct {
2363
- name redact.SafeString
2364
- saveInterval durationSetting
2365
- warnEveryN util.EveryN
2363
+ config saveRateConfig
2364
+ warnEveryN util.EveryN
2365
+
2366
+ clock timeutil.TimeSource
2366
2367
2367
2368
lastSave time.Time
2368
2369
avgSaveDuration time.Duration
2369
2370
}
2370
2371
2371
2372
// newSaveRateLimiter returns a new saveRateLimiter.
2372
- func newSaveRateLimiter (name redact.SafeString , saveInterval durationSetting ) * saveRateLimiter {
2373
- return & saveRateLimiter {
2374
- name : name ,
2375
- saveInterval : saveInterval ,
2376
- warnEveryN : util .Every (time .Minute ),
2373
+ func newSaveRateLimiter (
2374
+ config saveRateConfig , clock timeutil.TimeSource ,
2375
+ ) (* saveRateLimiter , error ) {
2376
+ if len (config .name ) == 0 {
2377
+ return nil , errors .AssertionFailedf ("name is required" )
2378
+ }
2379
+ if config .intervalName == nil {
2380
+ return nil , errors .AssertionFailedf ("interval name is required" )
2381
+ }
2382
+ if config .interval == nil {
2383
+ return nil , errors .AssertionFailedf ("interval is required" )
2377
2384
}
2385
+ return & saveRateLimiter {
2386
+ config : config ,
2387
+ warnEveryN : util .Every (time .Minute ),
2388
+ clock : clock ,
2389
+ }, nil
2378
2390
}
2379
2391
2380
2392
// canSave returns whether enough time has passed to save progress again.
2381
- func (l * saveRateLimiter ) canSave (ctx context.Context , sv * settings. Values ) bool {
2382
- interval := l .saveInterval . Get ( sv )
2383
- if interval = = 0 {
2393
+ func (l * saveRateLimiter ) canSave (ctx context.Context ) bool {
2394
+ interval := l .config . interval ( )
2395
+ if interval < = 0 {
2384
2396
return false
2385
2397
}
2386
- now := timeutil .Now ()
2398
+ if l .config .jitter != nil {
2399
+ if jitter := l .config .jitter (); jitter > 0 {
2400
+ if maxJitter := time .Duration (jitter * float64 (interval )); maxJitter > 0 {
2401
+ interval += time .Duration (rand .Int63n (int64 (maxJitter ) + 1 ))
2402
+ }
2403
+ }
2404
+ }
2405
+ now := l .clock .Now ()
2387
2406
elapsed := now .Sub (l .lastSave )
2388
2407
if elapsed < interval {
2389
2408
return false
@@ -2393,8 +2412,7 @@ func (l *saveRateLimiter) canSave(ctx context.Context, sv *settings.Values) bool
2393
2412
log .Changefeed .Warningf (ctx , "cannot save %s even though %s has elapsed " +
2394
2413
"since last save and %s is set to %s because average duration to save was %s " +
2395
2414
"and further saving is disabled until that much time elapses" ,
2396
- l .name , elapsed , l .saveInterval .Name (),
2397
- interval , l .avgSaveDuration )
2415
+ l .config .name , elapsed , l .config .intervalName (), interval , l .avgSaveDuration )
2398
2416
}
2399
2417
return false
2400
2418
}
@@ -2404,7 +2422,7 @@ func (l *saveRateLimiter) canSave(ctx context.Context, sv *settings.Values) bool
2404
2422
// doneSave must be called after each save is completed with the duration
2405
2423
// it took to save progress.
2406
2424
func (l * saveRateLimiter ) doneSave (saveDuration time.Duration ) {
2407
- l .lastSave = timeutil .Now ()
2425
+ l .lastSave = l . clock .Now ()
2408
2426
2409
2427
// Update the average save duration using an exponential moving average.
2410
2428
if l .avgSaveDuration == 0 {
0 commit comments