
Commit b847867

add explanation why p95 of direct and loki might not be the same
1 parent 73df456 commit b847867

4 files changed (+116 −2 lines changed)

book/src/libs/wasp/benchspy/loki_dillema.md

Lines changed: 13 additions & 0 deletions
@@ -20,3 +20,16 @@ This means you can:
 - Avoid calculating metrics like the median, 95th percentile latency, or error ratio yourself.
 
 By using `Direct`, you save resources and simplify the process when advanced analysis isn't required.
+
+> [!WARNING]
+> Metrics calculated by the two query executors may differ slightly because they process and aggregate data differently:
+> - **`Direct` QueryExecutor**: processes every individual data point in the raw dataset, so each value contributes to calculations such as averages or percentiles. It gives the most granular and precise results, but it is also more sensitive to outliers and noise in the data.
+> - **`Loki` QueryExecutor**: aggregates data using a default window size of 10 seconds. Within each window, multiple raw data points are combined (e.g. averaged or summed), which reduces the granularity of the dataset. This improves performance and reduces noise, but it also smooths the data and may obscure outliers or small-scale variability.
+
+> #### Why This Matters for Percentiles
+> Percentiles, such as the 95th percentile (p95), are particularly sensitive to the granularity of the input data:
+> - In the **`Direct` QueryExecutor**, p95 is calculated across all raw data points, capturing the true variability of the dataset, including extreme values and spikes.
+> - In the **`Loki` QueryExecutor**, p95 is calculated over the aggregated data (i.e. the 10-second windows). The raw values within each window are smoothed into a single representative value, which can lower or otherwise alter the calculated p95. For example, an outlier that would significantly raise the p95 in the `Direct` calculation might be averaged out within its `Loki` window, yielding a slightly lower percentile.
+
+> #### Key Takeaway
+> The difference arises because `Direct` prioritizes precision by using raw data, while `Loki` prioritizes efficiency and scalability by using aggregated data. When interpreting results, consider how Loki's smoothing may mask variability or extremes in the dataset; this is especially important for percentiles, where such details can significantly influence the outcome.
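
The warning above describes the smoothing effect in prose; the following minimal Go sketch (not part of this commit) illustrates it numerically. The one-sample-per-second data, the plain window mean, and the nearest-rank percentile are illustrative assumptions, not BenchSpy's actual implementation.

```go
package main

import (
	"fmt"
	"math"
	"sort"
)

// percentile returns the p-th percentile (0 < p <= 1) using nearest-rank on a sorted copy.
func percentile(values []float64, p float64) float64 {
	sorted := append([]float64(nil), values...)
	sort.Float64s(sorted)
	rank := int(math.Ceil(p*float64(len(sorted)))) - 1
	if rank < 0 {
		rank = 0
	}
	return sorted[rank]
}

func main() {
	// Pretend we sampled latency once per second for 100 seconds:
	// a stable ~50 ms baseline with a 6-second spike to 250 ms.
	var raw []float64
	for i := 0; i < 100; i++ {
		v := 50.0
		if i >= 90 && i < 96 {
			v = 250.0
		}
		raw = append(raw, v)
	}

	// "Loki-like" view: collapse each 10-second window into its average,
	// then compute the percentile over the window values.
	var windowed []float64
	for start := 0; start < len(raw); start += 10 {
		sum := 0.0
		for _, v := range raw[start : start+10] {
			sum += v
		}
		windowed = append(windowed, sum/10)
	}

	fmt.Printf("p95 over raw samples:      %.1f ms\n", percentile(raw, 0.95))      // 250.0 ms
	fmt.Printf("p95 over 10s window means: %.1f ms\n", percentile(windowed, 0.95)) // 170.0 ms
	// The spike dominates the raw p95, but inside its window it is averaged
	// with baseline samples, so the windowed p95 comes out noticeably lower.
}
```

BenchSpy's real Loki queries aggregate differently than a plain mean, but the mechanism is the same: each window contributes a single representative value to the percentile, so a short spike that dominates the raw p95 can be partially averaged away.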

book/src/libs/wasp/benchspy/loki_std.md

Lines changed: 4 additions & 0 deletions
@@ -105,6 +105,10 @@ compareMedian(string(benchspy.Percentile95Latency))
 compareMedian(string(benchspy.ErrorRate))
 ```
 
+> [!WARNING]
+> Standard Loki metrics are all calculated over a 10-second moving window, which smooths the values due to aggregation.
+> To learn what that means in detail, please refer to the [To Loki or Not to Loki](./loki_dillema.md) chapter.
+
 ## What’s Next?
 
 In this example, we used standard metrics, which are the same as in the first test. Now, [let’s explore how to use your custom LogQL queries](./loki_custom.md).

wasp/benchspy/direct.go

Lines changed: 6 additions & 2 deletions
@@ -154,7 +154,9 @@ func (g *DirectQueryExecutor) standardQuery(standardMetric StandardLoadMetric) (
 	medianFn := func(responses *wasp.SliceBuffer[wasp.Response]) (float64, error) {
 		var asMiliDuration []float64
 		for _, response := range responses.Data {
-			asMiliDuration = append(asMiliDuration, float64(response.Duration.Milliseconds()))
+			// get the duration as nanoseconds and convert to milliseconds in order not to lose precision;
+			// otherwise, the duration would be truncated to a whole millisecond
+			asMiliDuration = append(asMiliDuration, float64(response.Duration.Nanoseconds())/1_000_000)
 		}
 
 		return CalculatePercentile(asMiliDuration, 0.5), nil
@@ -164,7 +166,9 @@ func (g *DirectQueryExecutor) standardQuery(standardMetric StandardLoadMetric) (
 	p95Fn := func(responses *wasp.SliceBuffer[wasp.Response]) (float64, error) {
 		var asMiliDuration []float64
 		for _, response := range responses.Data {
-			asMiliDuration = append(asMiliDuration, float64(response.Duration.Milliseconds()))
+			// get the duration as nanoseconds and convert to milliseconds in order not to lose precision;
+			// otherwise, the duration would be truncated to a whole millisecond
+			asMiliDuration = append(asMiliDuration, float64(response.Duration.Nanoseconds())/1_000_000)
 		}
 
 		return CalculatePercentile(asMiliDuration, 0.95), nil
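
To see why this change matters, here is a small, self-contained snippet (not part of the commit) comparing the two conversions: `time.Duration.Milliseconds()` returns an integer and drops the sub-millisecond part, while dividing the nanosecond value keeps it.

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	d := 50*time.Millisecond + 750*time.Microsecond // 50.75 ms

	// Old conversion: Milliseconds() returns an int64, so the fractional
	// part of every sample is dropped before it reaches the percentile math.
	truncated := float64(d.Milliseconds())

	// New conversion: keep nanosecond resolution and divide, preserving
	// sub-millisecond precision.
	precise := float64(d.Nanoseconds()) / 1_000_000

	fmt.Println(truncated) // 50
	fmt.Println(precise)   // 50.75
}
```

With mock calls sleeping roughly 50 ms, truncation can shift a sample by up to a millisecond, which is on the same order as the 1–1.5% tolerances used in the comparison test below, so the extra precision plausibly matters for keeping the Direct and Loki results within tolerance.
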
Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
package main

import (
	"context"
	"fmt"
	"math"
	"strconv"
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	"github.com/smartcontractkit/chainlink-testing-framework/wasp"
	"github.com/smartcontractkit/chainlink-testing-framework/wasp/benchspy"
)

// both executors should yield nearly the same results
func TestBenchSpy_Standard_Direct_And_Loki_Metrics(t *testing.T) {
	// this test requires CTFv2 node_set with observability stack to be running

	label := "benchspy-direct-loki"

	gen, err := wasp.NewGenerator(&wasp.Config{
		T:           t,
		GenName:     "vu",
		CallTimeout: 100 * time.Millisecond,
		LoadType:    wasp.VU,
		Schedule:    wasp.Plain(1, 10*time.Second),
		VU: wasp.NewMockVU(&wasp.MockVirtualUserConfig{
			CallSleep: 50 * time.Millisecond,
		}),
		Labels: map[string]string{
			"branch": label,
			"commit": label,
		},
		LokiConfig: wasp.NewEnvLokiConfig(),
	})
	require.NoError(t, err)

	gen.Run(true)

	baseLineReport, err := benchspy.NewStandardReport(
		"91ee9e3c903d52de12f3d0c1a07ac3c2a6d141fb",
		benchspy.WithStandardQueries(benchspy.StandardQueryExecutor_Direct, benchspy.StandardQueryExecutor_Loki),
		benchspy.WithGenerators(gen),
	)
	require.NoError(t, err, "failed to create original report")

	fetchCtx, cancelFn := context.WithTimeout(context.Background(), 60*time.Second)
	defer cancelFn()

	fetchErr := baseLineReport.FetchData(fetchCtx)
	require.NoError(t, fetchErr, "failed to fetch current report")

	currentAsLokiSlices := benchspy.MustAllLokiResults(baseLineReport)
	currentAsDirectFloats := benchspy.MustAllDirectResults(baseLineReport)

	require.NotEmpty(t, currentAsLokiSlices[string(benchspy.MedianLatency)], "%s results were missing for loki", string(benchspy.MedianLatency))
	require.NotEmpty(t, currentAsDirectFloats[string(benchspy.MedianLatency)], "%s results were missing for direct", string(benchspy.MedianLatency))

	var compareValues = func(t *testing.T, metricName string, lokiFloat, directFloat, maxDiffPrecentage float64) {
		var diffPrecentage float64
		if lokiFloat != 0.0 && directFloat != 0.0 {
			diffPrecentage = (directFloat - lokiFloat) / lokiFloat * 100
		} else if lokiFloat == 0.0 && directFloat == 0.0 {
			diffPrecentage = 0.0
		} else {
			diffPrecentage = 100.0
		}
		assert.LessOrEqual(t, math.Abs(diffPrecentage), maxDiffPrecentage, "%s values differ by %s%%, which is more than the allowed %.2f%%", metricName, fmt.Sprintf("%.4f", diffPrecentage), maxDiffPrecentage)
	}

	lokiFloatSlice, err := benchspy.StringSliceToFloat64Slice(currentAsLokiSlices[string(benchspy.MedianLatency)])
	require.NoError(t, err, "failed to convert %s results to float64 slice", string(benchspy.MedianLatency))
	lokiMedian := benchspy.CalculatePercentile(lokiFloatSlice, 0.5)

	compareValues(t, string(benchspy.MedianLatency), lokiMedian, currentAsDirectFloats[string(benchspy.MedianLatency)], 1.0)

	lokip95 := benchspy.CalculatePercentile(lokiFloatSlice, 0.95)
	// here the max diff is 1.5% because of the higher impact of data aggregation in loki
	compareValues(t, string(benchspy.Percentile95Latency), lokip95, currentAsDirectFloats[string(benchspy.Percentile95Latency)], 1.5)

	lokiErrorRate := 0
	for _, v := range currentAsLokiSlices[string(benchspy.ErrorRate)] {
		asInt, err := strconv.Atoi(v)
		require.NoError(t, err)
		lokiErrorRate += asInt
	}

	lokiErrorRate = lokiErrorRate / len(currentAsLokiSlices[string(benchspy.ErrorRate)])
	compareValues(t, string(benchspy.ErrorRate), float64(lokiErrorRate), currentAsDirectFloats[string(benchspy.ErrorRate)], 1.0)
}
