
Commit b847867

add explanation why p95 of direct and loki might not be the same
1 parent 73df456 commit b847867

4 files changed (+116 −2 lines changed)

book/src/libs/wasp/benchspy/loki_dillema.md

Lines changed: 13 additions & 0 deletions
@@ -20,3 +20,16 @@ This means you can:
 - Avoid calculating metrics like the median, 95th percentile latency, or error ratio yourself.
 
 By using `Direct`, you save resources and simplify the process when advanced analysis isn't required.
+
+> [!WARNING]
+> Metrics calculated by the two query executors may differ slightly because they process and aggregate data differently:
+> - **`Direct` QueryExecutor**: processes every individual data point in the raw dataset, so each value contributes to calculations such as averages or percentiles. It gives the most granular and precise results, but it is also more sensitive to outliers and noise in the data.
+> - **`Loki` QueryExecutor**: aggregates data using a default window size of 10 seconds. Within each window, multiple raw data points are combined (e.g. averaged or summed), which reduces the granularity of the dataset. This improves performance and reduces noise, but it also smooths the data and may obscure outliers or small-scale variability.
+
+> #### Why This Matters for Percentiles
+> Percentiles, such as the 95th percentile (p95), are particularly sensitive to the granularity of the input data:
+> - In the **`Direct` QueryExecutor**, p95 is calculated across all raw data points, capturing the true variability of the dataset, including extreme values and spikes.
+> - In the **`Loki` QueryExecutor**, p95 is calculated over the aggregated data (i.e. the 10-second windows). The raw values within each window are smoothed into a single representative value, which can lower or otherwise alter the calculated p95. For example, an outlier that would significantly raise the p95 in the `Direct` calculation might be averaged out within its `Loki` window, yielding a slightly lower percentile.
+
+> #### Key Takeaway
+> The difference arises because `Direct` prioritizes precision by using raw data, while `Loki` prioritizes efficiency and scalability by using aggregated data. When interpreting results, consider how Loki's smoothing may mask variability or extremes in the dataset; this is especially important for percentiles, where such details can significantly influence the outcome.
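
The warning above describes the smoothing effect in prose; the following minimal Go sketch (not part of this commit) illustrates it numerically. The one-sample-per-second data, the plain window mean, and the nearest-rank percentile are illustrative assumptions, not BenchSpy's actual implementation.

```go
package main

import (
	"fmt"
	"math"
	"sort"
)

// percentile returns the p-th percentile (0 < p <= 1) using nearest-rank on a sorted copy.
func percentile(values []float64, p float64) float64 {
	sorted := append([]float64(nil), values...)
	sort.Float64s(sorted)
	rank := int(math.Ceil(p*float64(len(sorted)))) - 1
	if rank < 0 {
		rank = 0
	}
	return sorted[rank]
}

func main() {
	// Pretend we sampled latency once per second for 100 seconds:
	// a stable ~50 ms baseline with a 6-second spike to 250 ms.
	var raw []float64
	for i := 0; i < 100; i++ {
		v := 50.0
		if i >= 90 && i < 96 {
			v = 250.0
		}
		raw = append(raw, v)
	}

	// "Loki-like" view: collapse each 10-second window into its average,
	// then compute the percentile over the window values.
	var windowed []float64
	for start := 0; start < len(raw); start += 10 {
		sum := 0.0
		for _, v := range raw[start : start+10] {
			sum += v
		}
		windowed = append(windowed, sum/10)
	}

	fmt.Printf("p95 over raw samples:      %.1f ms\n", percentile(raw, 0.95))      // 250.0 ms
	fmt.Printf("p95 over 10s window means: %.1f ms\n", percentile(windowed, 0.95)) // 170.0 ms
	// The spike dominates the raw p95, but inside its window it is averaged
	// with baseline samples, so the windowed p95 comes out noticeably lower.
}
```

BenchSpy's real Loki queries aggregate differently than a plain mean, but the mechanism is the same: each window contributes a single representative value to the percentile, so a short spike that dominates the raw p95 can be partially averaged away.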

book/src/libs/wasp/benchspy/loki_std.md

Lines changed: 4 additions & 0 deletions
@@ -105,6 +105,10 @@ compareMedian(string(benchspy.Percentile95Latency))
 compareMedian(string(benchspy.ErrorRate))
 ```
 
+> [!WARNING]
+> Standard Loki metrics are all calculated over a 10-second moving window, which smooths the values due to aggregation.
+> To learn what that means in detail, please refer to the [To Loki or Not to Loki](./loki_dillema.md) chapter.
+
 ## What’s Next?
 
 In this example, we used standard metrics, which are the same as in the first test. Now, [let’s explore how to use your custom LogQL queries](./loki_custom.md).

wasp/benchspy/direct.go

Lines changed: 6 additions & 2 deletions
@@ -154,7 +154,9 @@ func (g *DirectQueryExecutor) standardQuery(standardMetric StandardLoadMetric) (
 	medianFn := func(responses *wasp.SliceBuffer[wasp.Response]) (float64, error) {
 		var asMiliDuration []float64
 		for _, response := range responses.Data {
-			asMiliDuration = append(asMiliDuration, float64(response.Duration.Milliseconds()))
+			// get the duration as nanoseconds and convert to milliseconds in order not to lose precision;
+			// otherwise, the duration would be truncated to a whole millisecond
+			asMiliDuration = append(asMiliDuration, float64(response.Duration.Nanoseconds())/1_000_000)
 		}
 
 		return CalculatePercentile(asMiliDuration, 0.5), nil
@@ -164,7 +166,9 @@ func (g *DirectQueryExecutor) standardQuery(standardMetric StandardLoadMetric) (
 	p95Fn := func(responses *wasp.SliceBuffer[wasp.Response]) (float64, error) {
 		var asMiliDuration []float64
 		for _, response := range responses.Data {
-			asMiliDuration = append(asMiliDuration, float64(response.Duration.Milliseconds()))
+			// get the duration as nanoseconds and convert to milliseconds in order not to lose precision;
+			// otherwise, the duration would be truncated to a whole millisecond
+			asMiliDuration = append(asMiliDuration, float64(response.Duration.Nanoseconds())/1_000_000)
 		}
 
 		return CalculatePercentile(asMiliDuration, 0.95), nil
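
To see why this change matters, here is a small, self-contained snippet (not part of the commit) comparing the two conversions: `time.Duration.Milliseconds()` returns an integer and drops the sub-millisecond part, while dividing the nanosecond value keeps it.

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	d := 50*time.Millisecond + 750*time.Microsecond // 50.75 ms

	// Old conversion: Milliseconds() returns an int64, so the fractional
	// part of every sample is dropped before it reaches the percentile math.
	truncated := float64(d.Milliseconds())

	// New conversion: keep nanosecond resolution and divide, preserving
	// sub-millisecond precision.
	precise := float64(d.Nanoseconds()) / 1_000_000

	fmt.Println(truncated) // 50
	fmt.Println(precise)   // 50.75
}
```

With mock calls sleeping roughly 50 ms, truncation can shift a sample by up to a millisecond, which is on the same order as the 1–1.5% tolerances used in the comparison test below, so the extra precision plausibly matters for keeping the Direct and Loki results within tolerance.
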
Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
package main

import (
	"context"
	"fmt"
	"math"
	"strconv"
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	"github.com/smartcontractkit/chainlink-testing-framework/wasp"
	"github.com/smartcontractkit/chainlink-testing-framework/wasp/benchspy"
)

// both executors should yield nearly the same results
func TestBenchSpy_Standard_Direct_And_Loki_Metrics(t *testing.T) {
	// this test requires CTFv2 node_set with observability stack to be running

	label := "benchspy-direct-loki"

	gen, err := wasp.NewGenerator(&wasp.Config{
		T:           t,
		GenName:     "vu",
		CallTimeout: 100 * time.Millisecond,
		LoadType:    wasp.VU,
		Schedule:    wasp.Plain(1, 10*time.Second),
		VU: wasp.NewMockVU(&wasp.MockVirtualUserConfig{
			CallSleep: 50 * time.Millisecond,
		}),
		Labels: map[string]string{
			"branch": label,
			"commit": label,
		},
		LokiConfig: wasp.NewEnvLokiConfig(),
	})
	require.NoError(t, err)

	gen.Run(true)

	baseLineReport, err := benchspy.NewStandardReport(
		"91ee9e3c903d52de12f3d0c1a07ac3c2a6d141fb",
		benchspy.WithStandardQueries(benchspy.StandardQueryExecutor_Direct, benchspy.StandardQueryExecutor_Loki),
		benchspy.WithGenerators(gen),
	)
	require.NoError(t, err, "failed to create original report")

	fetchCtx, cancelFn := context.WithTimeout(context.Background(), 60*time.Second)
	defer cancelFn()

	fetchErr := baseLineReport.FetchData(fetchCtx)
	require.NoError(t, fetchErr, "failed to fetch current report")

	currentAsLokiSlices := benchspy.MustAllLokiResults(baseLineReport)
	currentAsDirectFloats := benchspy.MustAllDirectResults(baseLineReport)

	require.NotEmpty(t, currentAsLokiSlices[string(benchspy.MedianLatency)], "%s results were missing for loki", string(benchspy.MedianLatency))
	require.NotEmpty(t, currentAsDirectFloats[string(benchspy.MedianLatency)], "%s results were missing for direct", string(benchspy.MedianLatency))

	var compareValues = func(t *testing.T, metricName string, lokiFloat, directFloat, maxDiffPrecentage float64) {
		var diffPrecentage float64
		if lokiFloat != 0.0 && directFloat != 0.0 {
			diffPrecentage = (directFloat - lokiFloat) / lokiFloat * 100
		} else if lokiFloat == 0.0 && directFloat == 0.0 {
			diffPrecentage = 0.0
		} else {
			diffPrecentage = 100.0
		}
		assert.LessOrEqual(t, math.Abs(diffPrecentage), maxDiffPrecentage, "%s values differ by %s%%, which is more than the allowed %.2f%%", metricName, fmt.Sprintf("%.4f", diffPrecentage), maxDiffPrecentage)
	}

	lokiFloatSlice, err := benchspy.StringSliceToFloat64Slice(currentAsLokiSlices[string(benchspy.MedianLatency)])
	require.NoError(t, err, "failed to convert %s results to float64 slice", string(benchspy.MedianLatency))
	lokiMedian := benchspy.CalculatePercentile(lokiFloatSlice, 0.5)

	compareValues(t, string(benchspy.MedianLatency), lokiMedian, currentAsDirectFloats[string(benchspy.MedianLatency)], 1.0)

	lokip95 := benchspy.CalculatePercentile(lokiFloatSlice, 0.95)
	// here the max diff is 1.5% because of the higher impact of data aggregation in loki
	compareValues(t, string(benchspy.Percentile95Latency), lokip95, currentAsDirectFloats[string(benchspy.Percentile95Latency)], 1.5)

	lokiErrorRate := 0
	for _, v := range currentAsLokiSlices[string(benchspy.ErrorRate)] {
		asInt, err := strconv.Atoi(v)
		require.NoError(t, err)
		lokiErrorRate += asInt
	}

	lokiErrorRate = lokiErrorRate / len(currentAsLokiSlices[string(benchspy.ErrorRate)])
	compareValues(t, string(benchspy.ErrorRate), float64(lokiErrorRate), currentAsDirectFloats[string(benchspy.ErrorRate)], 1.0)
}
