Skip to content

Commit 3331f1b

Browse files
committed
Allow 5s or 20% grace on per-job disruption testing
1 parent 4205f21 commit 3331f1b

File tree

1 file changed

+30
-6
lines changed

1 file changed

+30
-6
lines changed

pkg/monitortestlibrary/disruptionlibrary/disruption_invariant_adapter.go

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
_ "embed"
66
"fmt"
7+
"math"
78
"strings"
89
"sync"
910
"time"
@@ -111,25 +112,48 @@ func createDisruptionJunit(testName string, allowedDisruption *time.Duration, di
111112
}
112113
}
113114

115+
disruptionDuration := disruptedIntervals.Duration(1 * time.Second)
116+
roundedDisruptionDuration := disruptionDuration.Round(time.Second)
117+
118+
// Determine what amount of disruption we're willing to tolerate before we fail the test. We previously just
119+
// enforced being over a P99 over the past 3 weeks, however the P99 fluctuates wildly even under these
120+
// conditions, and the tests fail excessively on very low numbers. Thus we now also allow a grace amount to try to
121+
// establish this as a first line of defence to detect egregious regressions before they merge.
122+
//roundedAllowedDisruption, additionalDetails := calculateAllowedDisruptionWithGrace(*allowedDisruption)
123+
allowedDetails := []string{}
124+
allowedDetails = append(allowedDetails, fmt.Sprintf("P99 from historical data for similar jobs over past 3 weeks: %s",
125+
*allowedDisruption))
114126
if *allowedDisruption < 1*time.Second {
115127
t := 1 * time.Second
116128
allowedDisruption = &t
117-
disruptionDetails = "always allow at least one second"
129+
allowedDetails = append(allowedDetails, "rounded P99 up to always allow one second")
118130
}
119131

120-
disruptionDuration := disruptedIntervals.Duration(1 * time.Second)
121-
roundedAllowedDisruption := allowedDisruption.Round(time.Second)
122-
roundedDisruptionDuration := disruptionDuration.Round(time.Second)
132+
// Allow grace of 5s or 20%, at this layer, with one sample, we're only hoping to find really severe disruption:
133+
allowedSecs := allowedDisruption.Seconds()
134+
allowedSecsWithGrace := allowedSecs + 5.0
135+
allowedSecsPlus20Percent := allowedSecs * 1.2
136+
if allowedSecsPlus20Percent > allowedSecsWithGrace {
137+
allowedSecsWithGrace = allowedSecsPlus20Percent
138+
allowedDetails = append(allowedDetails, "added an additional 20% of grace")
139+
} else {
140+
allowedDetails = append(allowedDetails, "added an additional 5s of grace")
141+
}
142+
roundedFinal := int64(math.Round(allowedSecsWithGrace))
143+
finalAllowedDisruption := time.Duration(roundedFinal) * time.Second
123144

124-
if roundedDisruptionDuration <= roundedAllowedDisruption {
145+
if roundedDisruptionDuration <= finalAllowedDisruption {
125146
return &junitapi.JUnitTestCase{
126147
Name: testName,
127148
}
128149
}
129150

130151
reason := fmt.Sprintf("%v was unreachable during disruption: %v", locator.OldLocator(), disruptionDetails)
131152
describe := disruptedIntervals.Strings()
132-
failureMessage := fmt.Sprintf("%s for at least %s (maxAllowed=%s):\n\n%s", reason, roundedDisruptionDuration, roundedAllowedDisruption, strings.Join(describe, "\n"))
153+
failureMessage := fmt.Sprintf("%s for at least %s (maxAllowed=%s):\n%s\n\n%s", reason,
154+
roundedDisruptionDuration, finalAllowedDisruption,
155+
strings.Join(allowedDetails, "\n"),
156+
strings.Join(describe, "\n"))
133157

134158
return &junitapi.JUnitTestCase{
135159
Name: testName,

0 commit comments

Comments
 (0)