Allow 5s or 20% grace on per-job disruption testing

dgoodwin · dgoodwin · commit 3331f1b03566 · 2023-12-12T08:19:50.000-04:00
diff --git a/pkg/monitortestlibrary/disruptionlibrary/disruption_invariant_adapter.go b/pkg/monitortestlibrary/disruptionlibrary/disruption_invariant_adapter.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	_ "embed"
 	"fmt"
+	"math"
 	"strings"
 	"sync"
 	"time"
@@ -111,25 +112,48 @@ func createDisruptionJunit(testName string, allowedDisruption *time.Duration, di
 		}
 	}
 
+	disruptionDuration := disruptedIntervals.Duration(1 * time.Second)
+	roundedDisruptionDuration := disruptionDuration.Round(time.Second)
+
+	// Determine what amount of disruption we're willing to tolerate before we fail the test. We previously just
+	// enforced being over a P99 over the past 3 weeks, however the P99 fluctuates wildly even under these
+	// conditions, and the tests fail excessively on very low numbers. Thus we now also allow a grace amount to try to
+	// establish this as a first line of defence to detect egregious regressions before they merge.
+	//roundedAllowedDisruption, additionalDetails := calculateAllowedDisruptionWithGrace(*allowedDisruption)
+	allowedDetails := []string{}
+	allowedDetails = append(allowedDetails, fmt.Sprintf("P99 from historical data for similar jobs over past 3 weeks: %s",
+		*allowedDisruption))
 	if *allowedDisruption < 1*time.Second {
 		t := 1 * time.Second
 		allowedDisruption = &t
-		disruptionDetails = "always allow at least one second"
+		allowedDetails = append(allowedDetails, "rounded P99 up to always allow one second")
 	}
 
-	disruptionDuration := disruptedIntervals.Duration(1 * time.Second)
-	roundedAllowedDisruption := allowedDisruption.Round(time.Second)
-	roundedDisruptionDuration := disruptionDuration.Round(time.Second)
+	// Allow grace of 5s or 20%, at this layer, with one sample, we're only hoping to find really severe disruption:
+	allowedSecs := allowedDisruption.Seconds()
+	allowedSecsWithGrace := allowedSecs + 5.0
+	allowedSecsPlus20Percent := allowedSecs * 1.2
+	if allowedSecsPlus20Percent > allowedSecsWithGrace {
+		allowedSecsWithGrace = allowedSecsPlus20Percent
+		allowedDetails = append(allowedDetails, "added an additional 20% of grace")
+	} else {
+		allowedDetails = append(allowedDetails, "added an additional 5s of grace")
+	}
+	roundedFinal := int64(math.Round(allowedSecsWithGrace))
+	finalAllowedDisruption := time.Duration(roundedFinal) * time.Second
 
-	if roundedDisruptionDuration <= roundedAllowedDisruption {
+	if roundedDisruptionDuration <= finalAllowedDisruption {
 		return &junitapi.JUnitTestCase{
 			Name: testName,
 		}
 	}
 
 	reason := fmt.Sprintf("%v was unreachable during disruption: %v", locator.OldLocator(), disruptionDetails)
 	describe := disruptedIntervals.Strings()
-	failureMessage := fmt.Sprintf("%s for at least %s (maxAllowed=%s):\n\n%s", reason, roundedDisruptionDuration, roundedAllowedDisruption, strings.Join(describe, "\n"))
+	failureMessage := fmt.Sprintf("%s for at least %s (maxAllowed=%s):\n%s\n\n%s", reason,
+		roundedDisruptionDuration, finalAllowedDisruption,
+		strings.Join(allowedDetails, "\n"),
+		strings.Join(describe, "\n"))
 
 	return &junitapi.JUnitTestCase{
 		Name: testName,