@@ -10,78 +10,103 @@ import (
1010 "github.com/rs/zerolog"
1111
1212 "github.com/onflow/flow-go/module"
13+ "github.com/onflow/flow-go/module/component"
14+ "github.com/onflow/flow-go/module/irrecoverable"
1315)
1416
17+ // Cleaner uses component.ComponentManager to implement module.Startable and module.ReadyDoneAware
18+ // to run an internal goroutine which runs badger value log garbage collection at a semi-regular interval.
19+ // The Cleaner exists for 2 reasons:
20+ // - Run GC frequently enough that each GC is relatively inexpensive
21+ // - Avoid GC being synchronized across all nodes. Since in the happy path, all nodes have very similar
22+ // database load patterns, without intervention they are likely to schedule GC at the same time, which
23+ // can cause temporary consensus halts.
1524type Cleaner struct {
16- log zerolog.Logger
17- db * badger.DB
18- metrics module.CleanerMetrics
19- enabled bool
20- ratio float64
21- freq int
22- calls int
25+ component.Component // lifecycle implementation; assigned in NewCleaner
26+ log zerolog.Logger // logger tagged with component=cleaner
27+ db * badger.DB // database on which RunValueLogGC is invoked
28+ metrics module.CleanerMetrics // receives the duration of each successful GC run
29+ ratio float64 // ratio argument passed to db.RunValueLogGC
30+ interval time.Duration // base wait between GC runs; zero disables the cleaner
2331}
2432
25- // NewCleaner returns a cleaner that runs the badger value log garbage collection once every `frequency` calls
26- // if a frequency of zero is passed in, we will not run the GC at all
27- func NewCleaner (log zerolog.Logger , db * badger.DB , metrics module.CleanerMetrics , frequency int ) * Cleaner {
33+ var _ component.Component = (* Cleaner )(nil ) // compile-time check that *Cleaner satisfies component.Component
34+
35+ // NewCleaner returns a Cleaner that runs the badger value log garbage collection once every `interval` duration.
36+ // If an interval of zero is passed in, the Cleaner is backed by module.NoopComponent and no GC worker is built.
37+ func NewCleaner (log zerolog.Logger , db * badger.DB , metrics module.CleanerMetrics , interval time.Duration ) * Cleaner {
2838 // NOTE: we run garbage collection frequently at points in our business
2939 // logic where we are likely to have a small breather in activity; it thus
3040 // makes sense to run garbage collection often, with a smaller ratio, rather
3141 // than running it rarely and having big rewrites at once
3242 c := & Cleaner {
33- log : log .With ().Str ("component" , "cleaner" ).Logger (),
34- db : db ,
35- metrics : metrics ,
36- ratio : 0.2 ,
37- freq : frequency ,
38- enabled : frequency > 0 , // Disable if passed in 0 as frequency
43+ log : log .With ().Str ("component" , "cleaner" ).Logger (),
44+ db : db ,
45+ metrics : metrics ,
46+ ratio : 0.2 ,
47+ interval : interval ,
3948 }
40- // we don't want the entire network to run GC at the same time, so
41- // distribute evenly over time
42- if c .enabled {
43- c .calls = rand .Intn (c .freq )
49+
50+ // An interval of 0 disables GC: use module.NoopComponent instead of building a GC worker.
51+ if c .interval == 0 {
52+ c .Component = & module.NoopComponent {}
53+ return c
4454 }
55+
56+ c .Component = component .NewComponentManagerBuilder ().
57+ AddWorker (c .gcWorkerRoutine ).
58+ Build ()
59+
4560 return c
4661}
4762
48- func (c * Cleaner ) RunGC () {
49- if ! c .enabled {
63+ // gcWorkerRoutine runs badger GC on timely basis.
64+ func (c * Cleaner ) gcWorkerRoutine (ctx irrecoverable.SignalerContext , ready component.ReadyFunc ) {
65+ ready ()
66+ ticker := time .NewTicker (c .nextWaitDuration ())
67+ defer ticker .Stop ()
68+ for {
69+ select {
70+ case <- ctx .Done ():
71+ return
72+ case <- ticker .C :
73+ c .runGC ()
74+
75+ // reset the ticker with a new interval and random jitter
76+ ticker .Reset (c .nextWaitDuration ())
77+ }
78+ }
79+ }
80+
81+ // nextWaitDuration calculates next duration for Cleaner to wait before attempting to run GC.
82+ // We add 20% jitter into the interval, so that we don't risk nodes syncing their GC calls over time.
83+ // Therefore GC is run every X seconds, where X is uniformly sampled from [interval, interval*1.2]
84+ func (c * Cleaner ) nextWaitDuration () time.Duration {
85+ return time .Duration (c .interval .Milliseconds () + rand .Int63n (c .interval .Milliseconds ()/ 5 ))
86+ }
87+
88+ // runGC synchronously runs one badger value log GC pass. Sentinel errors (ErrRejected, ErrNoRewrite) and any other failure are only logged; the GC duration is logged and reported to metrics only on success.
89+ func (c * Cleaner ) runGC () {
90+ started := time .Now () // start of the span reported as gc_duration
91+ err := c .db .RunValueLogGC (c .ratio )
92+ if err == badger .ErrRejected { // NOTE(review): direct comparison does not match wrapped errors; consider errors.Is
93+ // NOTE: this happens when a GC call is already running
94+ c .log .Warn ().Msg ("garbage collection on value log already running" )
5095 return
5196 }
52- // only actually run approximately every frequency number of calls
53- c .calls ++
54- if c .calls < c .freq {
97+ if err == badger .ErrNoRewrite {
98+ // NOTE: this happens when no files have any garbage to drop
99+ c .log .Debug ().Msg ("garbage collection on value log unnecessary" )
100+ return
101+ }
102+ if err != nil {
103+ c .log .Error ().Err (err ).Msg ("garbage collection on value log failed" )
55104 return
56105 }
57106
58- // we add 20% jitter into the interval, so that we don't risk nodes syncing
59- // up on their GC calls over time
60- c .calls = rand .Intn (c .freq / 5 )
61-
62- // run the garbage collection in own goroutine and handle sentinel errors
63- go func () {
64- started := time .Now ()
65- err := c .db .RunValueLogGC (c .ratio )
66- if err == badger .ErrRejected {
67- // NOTE: this happens when a GC call is already running
68- c .log .Warn ().Msg ("garbage collection on value log already running" )
69- return
70- }
71- if err == badger .ErrNoRewrite {
72- // NOTE: this happens when no files have any garbage to drop
73- c .log .Debug ().Msg ("garbage collection on value log unnecessary" )
74- return
75- }
76- if err != nil {
77- c .log .Error ().Err (err ).Msg ("garbage collection on value log failed" )
78- return
79- }
80-
81- runtime := time .Since (started )
82- c .log .Debug ().
83- Dur ("gc_duration" , runtime ).
84- Msg ("garbage collection on value log executed" )
85- c .metrics .RanGC (runtime )
86- }()
107+ runtime := time .Since (started )
108+ c .log .Debug ().
109+ Dur ("gc_duration" , runtime ).
110+ Msg ("garbage collection on value log executed" )
111+ c .metrics .RanGC (runtime ) // reached only when RunValueLogGC returned a nil error
87112}
0 commit comments