Skip to content

Commit a63ea4f

Browse files
authored
Merge pull request #1567 from ydb-platform/slo-chaos-testing
SLO chaos testing
2 parents 0103e08 + 980d833 commit a63ea4f

File tree

2 files changed

+40
-8
lines changed

2 files changed

+40
-8
lines changed

.github/workflows/slo.yml

Lines changed: 26 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -54,7 +54,8 @@ jobs:
5454
label: xorm
5555

5656
concurrency:
57-
group: slo-${{ github.ref }}-${{matrix.sdk.name}}
57+
group: slo-${{ github.ref }}-${{ matrix.sdk.name }}
58+
cancel-in-progress: true
5859

5960
steps:
6061
- name: Checkout repository
@@ -78,17 +79,36 @@ jobs:
7879
github_pull_request_number: ${{ github.event.inputs.github_pull_request_number }}
7980
github_token: ${{ secrets.GITHUB_TOKEN }}
8081
sdk_name: ${{ matrix.sdk.name }}
82+
ydb_database_node_count: 5
8183

82-
- name: Run SLO Tests
84+
- name: Prepare SLO Database
8385
run: |
8486
./tests/slo/.bin/${{matrix.sdk.id}}_linux_amd64 create grpc://localhost:2135 /Root/testdb
87+
88+
- name: Run SLO Tests
89+
run: |
8590
./tests/slo/.bin/${{matrix.sdk.id}}_linux_amd64 run grpc://localhost:2135 /Root/testdb \
8691
-prom-pgw localhost:9091 \
8792
-report-period 250 \
8893
-time ${{inputs.slo_workload_duration_seconds || 600}} \
8994
-read-rps ${{inputs.slo_workload_read_max_rps || 1000}} \
9095
-write-rps ${{inputs.slo_workload_write_max_rps || 100}} \
91-
-read-timeout 10000 \
92-
-write-timeout 10000 \
93-
-shutdown-time 30
94-
./tests/slo/.bin/${{matrix.sdk.id}}_linux_amd64 cleanup grpc://localhost:2135 /Root/testdb
96+
-read-timeout 1000 \
97+
-write-timeout 1000 || true
98+
99+
- if: always()
100+
name: Store ydb chaos testing logs
101+
run: |
102+
docker logs ydb-chaos > chaos-ydb.log
103+
104+
- if: always()
105+
uses: actions/upload-artifact@v4
106+
with:
107+
name: ${{matrix.sdk.name}}-chaos-ydb.log
108+
path: ./chaos-ydb.log
109+
retention-days: 1
110+
111+
- if: always()
112+
name: Cleanup SLO Database
113+
run: |
114+
./tests/slo/.bin/${{matrix.sdk.id}}_linux_amd64 cleanup grpc://localhost:2135 /Root/testdb || true

tests/slo/internal/metrics/metrics.go

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ type (
2727
operationsFailureTotal *prometheus.CounterVec
2828
operationLatencySeconds *prometheus.HistogramVec
2929

30+
retryAttempts *prometheus.GaugeVec
3031
retryAttemptsTotal *prometheus.CounterVec
3132
retriesSuccessTotal *prometheus.CounterVec
3233
retriesFailureTotal *prometheus.CounterVec
@@ -107,6 +108,14 @@ func New(url, ref, label, jobName string) (*Metrics, error) {
107108
[]string{"operation_type", "operation_status"},
108109
)
109110

111+
m.retryAttempts = prometheus.NewGaugeVec(
112+
prometheus.GaugeOpts{
113+
Name: "sdk_retry_attempts",
114+
Help: "Current retry attempts, categorized by operation type.",
115+
},
116+
[]string{"operation_type"},
117+
)
118+
110119
m.retryAttemptsTotal = prometheus.NewCounterVec(
111120
prometheus.CounterOpts{
112121
Name: "sdk_retry_attempts_total",
@@ -147,6 +156,7 @@ func New(url, ref, label, jobName string) (*Metrics, error) {
147156
Collector(m.operationsSuccessTotal).
148157
Collector(m.operationsFailureTotal).
149158
Collector(m.operationLatencySeconds).
159+
Collector(m.retryAttempts).
150160
Collector(m.retryAttemptsTotal).
151161
Collector(m.retriesSuccessTotal).
152162
Collector(m.retriesFailureTotal).
@@ -167,6 +177,7 @@ func (m *Metrics) Reset() error {
167177
m.operationsFailureTotal.Reset()
168178
m.operationLatencySeconds.Reset()
169179

180+
m.retryAttempts.Reset()
170181
m.retryAttemptsTotal.Reset()
171182
m.retriesSuccessTotal.Reset()
172183
m.retriesFailureTotal.Reset()
@@ -192,17 +203,18 @@ func (j Span) Finish(err error, attempts int) {
192203
latency := time.Since(j.start)
193204
j.m.pendingOperations.WithLabelValues(j.name).Sub(1)
194205

206+
j.m.retryAttempts.WithLabelValues(j.name).Set(float64(attempts))
195207
j.m.operationsTotal.WithLabelValues(j.name).Add(1)
196208
j.m.retryAttemptsTotal.WithLabelValues(j.name).Add(float64(attempts))
197209

198210
if err != nil {
199211
j.m.errorsTotal.WithLabelValues(err.Error()).Add(1)
200-
// j.m.retriesFailureTotal.WithLabelValues(j.name).Add(1)
212+
j.m.retriesFailureTotal.WithLabelValues(j.name).Add(float64(attempts))
201213
j.m.operationsFailureTotal.WithLabelValues(j.name).Add(1)
202214
j.m.operationLatencySeconds.WithLabelValues(j.name, OperationStatusFailue).Observe(latency.Seconds())
203215
} else {
216+
j.m.retriesSuccessTotal.WithLabelValues(j.name).Add(float64(attempts))
204217
j.m.operationsSuccessTotal.WithLabelValues(j.name).Add(1)
205-
// j.m.retriesSuccessTotal.WithLabelValues(j.name).Add(1)
206218
j.m.operationLatencySeconds.WithLabelValues(j.name, OperationStatusSuccess).Observe(latency.Seconds())
207219
}
208220
}

0 commit comments

Comments
 (0)