Skip to content

Commit 593ef60

Browse files
committed
ci: add chaos testing
1 parent 17a901b commit 593ef60

File tree

3 files changed

+77
-8
lines changed

3 files changed

+77
-8
lines changed

.github/scripts/chaos-ydb.sh

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#!/bin/bash
2+
3+
# Wait 30 seconds before the first container restart is attempted.
# NOTE(review): presumably this lets the SLO workload start up before faults
# are injected — confirm against the workflow that launches this script.
sleep 30
4+
5+
# Repeatedly restart every running container whose name matches "ydb-dynamic"
# for roughly three minutes, pausing 30 seconds after each restart.
#
# Arguments:
#   $1 - signal handed to `docker restart --signal` (default: SIGTERM)
#   $2 - grace period in seconds handed to `docker restart --time` (default: 5)
chaos_ydb_dynamic_containers() {
    # Arguments do not change while the function runs, so resolve them once.
    local signal=${1:-"SIGTERM"}
    local grace=${2:-"5"}
    local pattern="ydb-dynamic"
    # Set the end time to 3 minutes (180 seconds) from the start.
    # SECONDS is bash's built-in counter of seconds since the shell started.
    local end_time=$((SECONDS + 180))
    local containers container_id

    while [ "$SECONDS" -lt "$end_time" ]; do
        containers=$(docker ps --filter "name=$pattern" -q)

        if [ -z "$containers" ]; then
            # Without this back-off the loop would spin on `docker ps` at full
            # speed until the deadline whenever no container matches.
            echo "No running containers match '$pattern'; retrying in 5s"
            sleep 5
            continue
        fi

        # $containers is intentionally unquoted: it is a whitespace-separated
        # list of container IDs that must be split into words.
        for container_id in $containers; do
            # Re-check the deadline between restarts so one pass over many
            # containers cannot overrun the 3-minute window by minutes.
            if [ "$SECONDS" -ge "$end_time" ]; then
                break
            fi

            echo "Restarting container with signal $signal: $container_id"
            docker restart --signal "$signal" --time "$grace" "$container_id"

            sleep 30
        done
    done
}
24+
25+
# Run three chaos phases back to back, one per stop signal. The first argument
# is the signal given to `docker restart --signal`; the optional second
# argument is the `--time` grace period in seconds (the function defaults it
# to 5). The SIGKILL phase passes 0 as the grace period.
chaos_ydb_dynamic_containers "SIGTERM";
26+
27+
chaos_ydb_dynamic_containers "SIGINT";
28+
29+
chaos_ydb_dynamic_containers "SIGKILL" 0;

.github/workflows/slo.yml

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,8 @@ jobs:
5454
label: xorm
5555

5656
concurrency:
57-
group: slo-${{ github.ref }}-${{matrix.sdk.name}}
57+
group: slo-${{ github.ref }}-${{ matrix.sdk.name }}
58+
cancel-in-progress: true
5859

5960
steps:
6061
- name: Checkout repository
@@ -78,17 +79,44 @@ jobs:
7879
github_pull_request_number: ${{ github.event.inputs.github_pull_request_number }}
7980
github_token: ${{ secrets.GITHUB_TOKEN }}
8081
sdk_name: ${{ matrix.sdk.name }}
82+
ydb_database_node_count: 5
8183

82-
- name: Run SLO Tests
84+
- name: Prepare SLO Database
8385
run: |
8486
./tests/slo/.bin/${{matrix.sdk.id}}_linux_amd64 create grpc://localhost:2135 /Root/testdb
87+
88+
- name: Chaos YDB
89+
run: |
90+
chmod +x ./.github/scripts/chaos-ydb.sh
91+
nohup ./.github/scripts/chaos-ydb.sh > chaos-ydb.log 2>&1 &
92+
93+
# - name: Chaos Network
94+
# run: |
95+
# sudo tc qdisc add dev lo root netem delay 100ms 50ms loss 5% corrupt 1%
96+
97+
- name: Run SLO Tests
98+
run: |
8599
./tests/slo/.bin/${{matrix.sdk.id}}_linux_amd64 run grpc://localhost:2135 /Root/testdb \
86100
-prom-pgw localhost:9091 \
87101
-report-period 250 \
88102
-time ${{inputs.slo_workload_duration_seconds || 600}} \
89103
-read-rps ${{inputs.slo_workload_read_max_rps || 1000}} \
90104
-write-rps ${{inputs.slo_workload_write_max_rps || 100}} \
91-
-read-timeout 10000 \
92-
-write-timeout 10000 \
93-
-shutdown-time 30
94-
./tests/slo/.bin/${{matrix.sdk.id}}_linux_amd64 cleanup grpc://localhost:2135 /Root/testdb
105+
-read-timeout 1000 \
106+
-write-timeout 1000 || true
107+
108+
# - if: always()
109+
# run: |
110+
# sudo tc qdisc del dev lo root
111+
112+
- if: always()
113+
uses: actions/upload-artifact@v4
114+
with:
115+
name: ${{matrix.sdk.name}}-chaos-ydb.log
116+
path: ./chaos-ydb.log
117+
retention-days: 1
118+
119+
- if: always()
120+
name: Cleanup SLO Database
121+
run: |
122+
./tests/slo/.bin/${{matrix.sdk.id}}_linux_amd64 cleanup grpc://localhost:2135 /Root/testdb || true

tests/slo/internal/metrics/metrics.go

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ type (
2727
operationsFailureTotal *prometheus.CounterVec
2828
operationLatencySeconds *prometheus.HistogramVec
2929

30+
retryAttempts *prometheus.GaugeVec
3031
retryAttemptsTotal *prometheus.CounterVec
3132
retriesSuccessTotal *prometheus.CounterVec
3233
retriesFailureTotal *prometheus.CounterVec
@@ -107,6 +108,14 @@ func New(url, ref, label, jobName string) (*Metrics, error) {
107108
[]string{"operation_type", "operation_status"},
108109
)
109110

111+
m.retryAttempts = prometheus.NewGaugeVec(
112+
prometheus.GaugeOpts{
113+
Name: "sdk_retry_attempts",
114+
Help: "Current retry attempts, categorized by operation type.",
115+
},
116+
[]string{"operation_type"},
117+
)
118+
110119
m.retryAttemptsTotal = prometheus.NewCounterVec(
111120
prometheus.CounterOpts{
112121
Name: "sdk_retry_attempts_total",
@@ -147,6 +156,7 @@ func New(url, ref, label, jobName string) (*Metrics, error) {
147156
Collector(m.operationsSuccessTotal).
148157
Collector(m.operationsFailureTotal).
149158
Collector(m.operationLatencySeconds).
159+
Collector(m.retryAttempts).
150160
Collector(m.retryAttemptsTotal).
151161
Collector(m.retriesSuccessTotal).
152162
Collector(m.retriesFailureTotal).
@@ -167,6 +177,7 @@ func (m *Metrics) Reset() error {
167177
m.operationsFailureTotal.Reset()
168178
m.operationLatencySeconds.Reset()
169179

180+
m.retryAttempts.Reset()
170181
m.retryAttemptsTotal.Reset()
171182
m.retriesSuccessTotal.Reset()
172183
m.retriesFailureTotal.Reset()
@@ -192,17 +203,18 @@ func (j Span) Finish(err error, attempts int) {
192203
latency := time.Since(j.start)
193204
j.m.pendingOperations.WithLabelValues(j.name).Sub(1)
194205

206+
j.m.retryAttempts.WithLabelValues(j.name).Set(float64(attempts))
195207
j.m.operationsTotal.WithLabelValues(j.name).Add(1)
196208
j.m.retryAttemptsTotal.WithLabelValues(j.name).Add(float64(attempts))
197209

198210
if err != nil {
199211
j.m.errorsTotal.WithLabelValues(err.Error()).Add(1)
200-
// j.m.retriesFailureTotal.WithLabelValues(j.name).Add(1)
212+
j.m.retriesFailureTotal.WithLabelValues(j.name).Add(float64(attempts))
201213
j.m.operationsFailureTotal.WithLabelValues(j.name).Add(1)
202214
j.m.operationLatencySeconds.WithLabelValues(j.name, OperationStatusFailue).Observe(latency.Seconds())
203215
} else {
216+
j.m.retriesSuccessTotal.WithLabelValues(j.name).Add(float64(attempts))
204217
j.m.operationsSuccessTotal.WithLabelValues(j.name).Add(1)
205-
// j.m.retriesSuccessTotal.WithLabelValues(j.name).Add(1)
206218
j.m.operationLatencySeconds.WithLabelValues(j.name, OperationStatusSuccess).Observe(latency.Seconds())
207219
}
208220
}

0 commit comments

Comments
 (0)