|
4 | 4 | "context" |
5 | 5 | _ "embed" |
6 | 6 | "fmt" |
| 7 | + "math" |
7 | 8 | "strings" |
8 | 9 | "sync" |
9 | 10 | "time" |
@@ -111,25 +112,48 @@ func createDisruptionJunit(testName string, allowedDisruption *time.Duration, di |
111 | 112 | } |
112 | 113 | } |
113 | 114 |
|
| 115 | + disruptionDuration := disruptedIntervals.Duration(1 * time.Second) |
| 116 | + roundedDisruptionDuration := disruptionDuration.Round(time.Second) |
| 117 | + |
| 118 | + // Determine what amount of disruption we're willing to tolerate before we fail the test. We previously just |
| 119 | + // failed whenever disruption exceeded the P99 over the past 3 weeks; however, the P99 fluctuates wildly even under |
| 120 | + // these conditions, and the tests fail excessively on very low numbers. Thus we now also allow a grace amount, so |
| 121 | + // that this check acts as a first line of defence to detect egregious regressions before they merge. |
| 122 | + //roundedAllowedDisruption, additionalDetails := calculateAllowedDisruptionWithGrace(*allowedDisruption) |
| 123 | + allowedDetails := []string{} |
| 124 | + allowedDetails = append(allowedDetails, fmt.Sprintf("P99 from historical data for similar jobs over past 3 weeks: %s", |
| 125 | + *allowedDisruption)) |
114 | 126 | if *allowedDisruption < 1*time.Second { |
115 | 127 | t := 1 * time.Second |
116 | 128 | allowedDisruption = &t |
117 | | - disruptionDetails = "always allow at least one second" |
| 129 | + allowedDetails = append(allowedDetails, "rounded P99 up to always allow one second") |
118 | 130 | } |
119 | 131 |
|
120 | | - disruptionDuration := disruptedIntervals.Duration(1 * time.Second) |
121 | | - roundedAllowedDisruption := allowedDisruption.Round(time.Second) |
122 | | - roundedDisruptionDuration := disruptionDuration.Round(time.Second) |
| 132 | + // Allow a grace of 5s or 20%, whichever is larger. At this layer, with one sample, we're only hoping to find really severe disruption: |
| 133 | + allowedSecs := allowedDisruption.Seconds() |
| 134 | + allowedSecsWithGrace := allowedSecs + 5.0 |
| 135 | + allowedSecsPlus20Percent := allowedSecs * 1.2 |
| 136 | + if allowedSecsPlus20Percent > allowedSecsWithGrace { |
| 137 | + allowedSecsWithGrace = allowedSecsPlus20Percent |
| 138 | + allowedDetails = append(allowedDetails, "added an additional 20% of grace") |
| 139 | + } else { |
| 140 | + allowedDetails = append(allowedDetails, "added an additional 5s of grace") |
| 141 | + } |
| 142 | + roundedFinal := int64(math.Round(allowedSecsWithGrace)) |
| 143 | + finalAllowedDisruption := time.Duration(roundedFinal) * time.Second |
123 | 144 |
|
124 | | - if roundedDisruptionDuration <= roundedAllowedDisruption { |
| 145 | + if roundedDisruptionDuration <= finalAllowedDisruption { |
125 | 146 | return &junitapi.JUnitTestCase{ |
126 | 147 | Name: testName, |
127 | 148 | } |
128 | 149 | } |
129 | 150 |
|
130 | 151 | reason := fmt.Sprintf("%v was unreachable during disruption: %v", locator.OldLocator(), disruptionDetails) |
131 | 152 | describe := disruptedIntervals.Strings() |
132 | | - failureMessage := fmt.Sprintf("%s for at least %s (maxAllowed=%s):\n\n%s", reason, roundedDisruptionDuration, roundedAllowedDisruption, strings.Join(describe, "\n")) |
| 153 | + failureMessage := fmt.Sprintf("%s for at least %s (maxAllowed=%s):\n%s\n\n%s", reason, |
| 154 | + roundedDisruptionDuration, finalAllowedDisruption, |
| 155 | + strings.Join(allowedDetails, "\n"), |
| 156 | + strings.Join(describe, "\n")) |
133 | 157 |
|
134 | 158 | return &junitapi.JUnitTestCase{ |
135 | 159 | Name: testName, |
|
0 commit comments