Skip to content

Commit 1063504

Browse files
authored
Make kv workloads fail on user timeouts and errors (#1924)
1 parent f5f19d3 commit 1063504

File tree

8 files changed

+74
-1
lines changed

8 files changed

+74
-1
lines changed

tests/slo/database/sql/query/main.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -127,6 +127,7 @@ func main() {
127127
go w.Metrics(ctx, &wg, metricsRL)
128128

129129
wg.Wait()
130+
w.FailOnError()
130131
default:
131132
panic(fmt.Errorf("unknown mode: %v", cfg.Mode))
132133
}

tests/slo/database/sql/table/main.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -127,6 +127,7 @@ func main() {
127127
go w.Metrics(ctx, &wg, metricsRL)
128128

129129
wg.Wait()
130+
w.FailOnError()
130131
default:
131132
panic(fmt.Errorf("unknown mode: %v", cfg.Mode))
132133
}

tests/slo/internal/log/log.go

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,3 +16,7 @@ func Printf(format string, args ...any) {
1616
func Println(args ...any) {
1717
fmt.Println(append([]any{timestampPrefix()}, args...)...)
1818
}
19+
20+
func Panicf(format string, args ...any) {
21+
panic(fmt.Sprintf(timestampPrefix()+format, args...))
22+
}

tests/slo/internal/metrics/metrics.go

Lines changed: 61 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,17 @@
11
package metrics
22

33
import (
4+
"context"
5+
"errors"
46
"fmt"
57
"time"
68

79
"github.com/prometheus/client_golang/prometheus"
810
"github.com/prometheus/client_golang/prometheus/push"
11+
dto "github.com/prometheus/client_model/go"
912
"github.com/ydb-platform/ydb-go-sdk/v3"
13+
14+
"slo/internal/log"
1015
)
1116

1217
const (
@@ -20,7 +25,8 @@ type (
2025
ref string
2126
label string
2227

23-
errorsTotal *prometheus.CounterVec
28+
errorsTotal *prometheus.CounterVec
29+
timeoutsTotal *prometheus.CounterVec
2430

2531
operationsTotal *prometheus.CounterVec
2632
operationsSuccessTotal *prometheus.CounterVec
@@ -60,6 +66,13 @@ func New(url, ref, label, jobName string) (*Metrics, error) {
6066
},
6167
[]string{"error_type"},
6268
)
69+
m.timeoutsTotal = prometheus.NewCounterVec(
70+
prometheus.CounterOpts{
71+
Name: "sdk_timeouts_total",
72+
Help: "Total number of timeout errors",
73+
},
74+
[]string{},
75+
)
6376

6477
m.operationsTotal = prometheus.NewCounterVec(
6578
prometheus.CounterOpts{
@@ -208,6 +221,9 @@ func (j Span) Finish(err error, attempts int) {
208221
j.m.retryAttemptsTotal.WithLabelValues(j.name).Add(float64(attempts))
209222

210223
if err != nil {
224+
if errors.Is(err, context.DeadlineExceeded) {
225+
j.m.timeoutsTotal.WithLabelValues().Add(1)
226+
}
211227
j.m.errorsTotal.WithLabelValues(err.Error()).Add(1)
212228
j.m.retriesFailureTotal.WithLabelValues(j.name).Add(float64(attempts))
213229
j.m.operationsFailureTotal.WithLabelValues(j.name).Add(1)
@@ -218,3 +234,47 @@ func (j Span) Finish(err error, attempts int) {
218234
j.m.operationLatencySeconds.WithLabelValues(j.name, OperationStatusSuccess).Observe(latency.Seconds())
219235
}
220236
}
237+
238+
func getCounterVecTotal(counterVec *prometheus.CounterVec) float64 {
239+
ch := make(chan prometheus.Metric, 100)
240+
go func() {
241+
counterVec.Collect(ch)
242+
close(ch)
243+
}()
244+
245+
var total float64
246+
for m := range ch {
247+
pb := &dto.Metric{}
248+
_ = m.Write(pb)
249+
if pb.GetCounter() != nil {
250+
total += pb.GetCounter().GetValue()
251+
}
252+
}
253+
254+
return total
255+
}
256+
257+
func (m *Metrics) OperationsTotal() float64 {
258+
return getCounterVecTotal(m.operationsTotal)
259+
}
260+
261+
func (m *Metrics) ErrorsTotal() float64 {
262+
return getCounterVecTotal(m.errorsTotal)
263+
}
264+
265+
func (m *Metrics) TimeoutsTotal() float64 {
266+
return getCounterVecTotal(m.timeoutsTotal)
267+
}
268+
269+
func (m *Metrics) FailOnError() {
270+
if m.ErrorsTotal() > 0 {
271+
log.Panicf(
272+
"unretriable (or not successfully retried) errors: %.0f errors out of %.0f operations",
273+
m.ErrorsTotal(),
274+
m.OperationsTotal(),
275+
)
276+
}
277+
if m.TimeoutsTotal() > 0 {
278+
log.Panicf("there are user timeouts: %.0f timeouts", m.TimeoutsTotal())
279+
}
280+
}

tests/slo/internal/workers/workers.go

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,10 @@ func New(cfg *config.Config, s ReadWriter, ref, label, jobName string) (*Workers
3535
}, nil
3636
}
3737

38+
func (w *Workers) FailOnError() {
39+
w.m.FailOnError()
40+
}
41+
3842
func (w *Workers) Close() error {
3943
return w.m.Reset()
4044
}

tests/slo/native/query/main.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -143,6 +143,7 @@ func main() {
143143
go w.Metrics(ctx, &wg, metricsRL)
144144

145145
wg.Wait()
146+
w.FailOnError()
146147
default:
147148
panic(fmt.Errorf("unknown mode: %v", cfg.Mode))
148149
}

tests/slo/native/table/main.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -127,6 +127,7 @@ func main() {
127127
go w.Metrics(ctx, &wg, metricsRL)
128128

129129
wg.Wait()
130+
w.FailOnError()
130131
default:
131132
panic(fmt.Errorf("unknown mode: %v", cfg.Mode))
132133
}

tests/slo/native/table/over/query/service/main.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -127,6 +127,7 @@ func main() {
127127
go w.Metrics(ctx, &wg, metricsRL)
128128

129129
wg.Wait()
130+
w.FailOnError()
130131
default:
131132
panic(fmt.Errorf("unknown mode: %v", cfg.Mode))
132133
}

0 commit comments

Comments
 (0)