@@ -2,15 +2,18 @@ package common
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"slices"
 	"sync"
 	"testing"
+	"testing/synctest"
 	"time"
 
 	"github.com/prometheus/client_golang/prometheus"
 	promclient "github.com/prometheus/client_model/go"
 	"github.com/stretchr/testify/require"
+	"go.uber.org/goleak"
 
 	"github.com/authzed/spicedb/internal/datastore/revisions"
 	"github.com/authzed/spicedb/pkg/datastore"
@@ -181,50 +184,50 @@ func (d revisionErrorDeleter) DeleteExpiredRels() (int64, error) {
 	return 0, nil
 }
 
+func alwaysErr() error {
+	return errors.New("aaagh")
+}
+
 func TestGCFailureBackoff(t *testing.T) {
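+	// goleak verifies that the collector's goroutines do not outlive the test.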
+	t.Cleanup(func() {
+		goleak.VerifyNone(t)
+	})
 	localCounter := prometheus.NewCounter(gcFailureCounterConfig)
 	reg := prometheus.NewRegistry()
 	require.NoError(t, reg.Register(localCounter))
 
-	ctx, cancel := context.WithCancel(t.Context())
-	defer cancel()
-	go func() {
-		gc := newFakeGCStore(alwaysErrorDeleter{})
-		require.Error(t, startGarbageCollectorWithMaxElapsedTime(ctx, gc, 100*time.Millisecond, 1*time.Second, 1*time.Nanosecond, 1*time.Minute, localCounter))
-	}()
-	time.Sleep(200 * time.Millisecond)
-	cancel()
+	errCh := make(chan error, 1)
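+	// synctest.Test runs the body in a bubble with a fake clock: time.Sleep
+	// advances instantly once every goroutine in the bubble is durably blocked,
+	// so the 1000s timeout elapses without real waiting.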
+	synctest.Test(t, func(t *testing.T) {
+		duration := 1000 * time.Second
+		ctx, cancel := context.WithTimeout(t.Context(), duration)
+		t.Cleanup(func() {
+			cancel()
+		})
+		go func() {
+			errCh <- runOnIntervalWithBackoff(ctx, alwaysErr, 100*time.Second, 1*time.Minute, localCounter)
+		}()
+		time.Sleep(duration)
+		synctest.Wait()
+	})
+	require.Error(t, <-errCh)
 
 	metrics, err := reg.Gather()
 	require.NoError(t, err)
 	var mf *promclient.MetricFamily
 	for _, metric := range metrics {
 		if metric.GetName() == "spicedb_datastore_gc_failure_total" {
 			mf = metric
+			break
 		}
 	}
-	require.Greater(t, *(mf.GetMetric()[0].Counter.Value), 100.0, "MaxElapsedTime=1ns did not cause backoff to get ignored")
-
-	localCounter = prometheus.NewCounter(gcFailureCounterConfig)
-	reg = prometheus.NewRegistry()
-	require.NoError(t, reg.Register(localCounter))
-	ctx, cancel = context.WithCancel(t.Context())
-	defer cancel()
-	go func() {
-		gc := newFakeGCStore(alwaysErrorDeleter{})
-		require.Error(t, startGarbageCollectorWithMaxElapsedTime(ctx, gc, 100*time.Millisecond, 0, 1*time.Second, 1*time.Minute, localCounter))
-	}()
-	time.Sleep(200 * time.Millisecond)
-	cancel()
-
-	metrics, err = reg.Gather()
-	require.NoError(t, err)
-	for _, metric := range metrics {
-		if metric.GetName() == "spicedb_datastore_gc_failure_total" {
-			mf = metric
-		}
-	}
-	require.Less(t, *(mf.GetMetric()[0].Counter.Value), 3.0, "MaxElapsedTime=0 should have not caused backoff to get ignored")
+	// We expect about 5 failures; the backoff library's jitter gives some
+	// wiggle room here. Failures should land at roughly 100s, 200s, 400s, and
+	// 800s, but jitter can squeeze the intervals enough to fit a fifth failure.
+	// Experimentally we see 5 most often and 4 sometimes, so asserting greater
+	// than 3 works here.
+	fmt.Println("failures")
+	fmt.Println(*(mf.GetMetric()[0].Counter.Value))
+	require.Greater(t, *(mf.GetMetric()[0].Counter.Value), 3.0, "did not see expected number of backoffs")
 }
 
 // Ensure the garbage collector interval is reset after recovering from an
@@ -238,23 +241,29 @@ func TestGCFailureBackoffReset(t *testing.T) {
 		errorOnRevisions: []uint64{1, 2, 3, 4, 5},
 	})
 
-	ctx, cancel := context.WithCancel(t.Context())
-	defer cancel()
-
-	go func() {
-		interval := 10 * time.Millisecond
-		window := 10 * time.Second
-		timeout := 1 * time.Minute
-
-		require.Error(t, StartGarbageCollector(ctx, gc, interval, window, timeout))
-	}()
+	errCh := make(chan error, 1)
+	synctest.Test(t, func(t *testing.T) {
+		ctx, cancel := context.WithCancel(t.Context())
+		go func() {
+			interval := 10 * time.Millisecond
+			window := 10 * time.Second
+			timeout := 1 * time.Minute
+
+			errCh <- StartGarbageCollector(ctx, gc, interval, window, timeout)
+		}()
+		// The garbage collector should fail 5 times starting with a 10ms interval,
+		// which should take ~160ms to complete. After that, it should resume
+		// completing a revision every 10ms, which should get us at least 20
+		// successful iterations within a 500ms total execution time.
+		// If the interval is not reset, it should blow past the 500ms timer
+		// after only completing ~3 iterations.
+		time.Sleep(500 * time.Millisecond)
+		cancel()
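+		// synctest.Wait blocks until every other goroutine in the bubble is
+		// durably blocked, so the collector has observed the cancellation and
+		// sent its error before the bubble exits.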
+		synctest.Wait()
+	})
 
-	time.Sleep(500 * time.Millisecond)
-	cancel()
+	require.Error(t, <-errCh)
 
-	// The next interval should have been reset after recovering from the error.
-	// If it is not reset, the last exponential backoff interval will not give
-	// the GC enough time to run.
 	gc.lock.Lock()
 	defer gc.lock.Unlock()
 
@@ -264,20 +273,29 @@ func TestGCUnlockOnTimeout(t *testing.T) {
 func TestGCUnlockOnTimeout(t *testing.T) {
 	gc := newFakeGCStore(alwaysErrorDeleter{})
 
-	ctx, cancel := context.WithCancel(t.Context())
-	defer cancel()
-
-	go func() {
-		interval := 10 * time.Millisecond
-		window := 10 * time.Second
-		timeout := 1 * time.Millisecond
-
-		require.Error(t, StartGarbageCollector(ctx, gc, interval, window, timeout))
-	}()
-
-	time.Sleep(30 * time.Millisecond)
-	require.False(t, gc.HasGCRun(), "GC should not have run")
+	errCh := make(chan error, 1)
+	hasRunChan := make(chan bool, 1)
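+	// Observations are captured on channels inside the synctest bubble and
+	// asserted on after synctest.Test returns.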
+	synctest.Test(t, func(t *testing.T) {
+		ctx, cancel := context.WithCancel(t.Context())
+		t.Cleanup(func() {
+			cancel()
+		})
+		go func() {
+			interval := 10 * time.Millisecond
+			window := 10 * time.Second
+			timeout := 1 * time.Minute
+
+			errCh <- StartGarbageCollector(ctx, gc, interval, window, timeout)
+		}()
+		time.Sleep(30 * time.Millisecond)
+		hasRunChan <- gc.HasGCRun()
+		cancel()
+		synctest.Wait()
+	})
+	require.Error(t, <-errCh)
+	require.False(t, <-hasRunChan, "GC should not have run because it should always be erroring.")
 
+	// TODO: should this be inside the goroutine as well?
 	gc.fakeGC.lock.Lock()
 	defer gc.fakeGC.lock.Unlock()
 