Skip to content

Commit fc824ef

Browse files
committed
ci: migrate to new slo action
1 parent cea4669 commit fc824ef

File tree

11 files changed

+223
-195
lines changed

11 files changed

+223
-195
lines changed

.github/workflows/slo.yml

Lines changed: 58 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,70 +1,76 @@
1-
name: SLO
1+
name: slo
22

33
on:
4+
push:
5+
branches:
6+
- main
47
pull_request:
5-
branches: [main]
8+
branches:
9+
- main
10+
- release-*
611
workflow_dispatch:
12+
inputs:
13+
github_pull_request_number:
14+
required: true
15+
slo_workload_duration_seconds:
16+
default: '600'
17+
required: false
18+
slo_workload_read_max_rps:
19+
default: '1000'
20+
required: false
21+
slo_workload_write_max_rps:
22+
default: '100'
23+
required: false
724

825
jobs:
9-
test-slo:
10-
concurrency:
11-
group: slo-${{ github.ref }}
26+
ydb-slo-action-init:
1227
if: (!contains(github.event.pull_request.labels.*.name, 'no slo'))
1328

29+
name: Run YDB SLO Tests
1430
runs-on: ubuntu-latest
15-
name: SLO test
16-
permissions:
17-
checks: write
18-
pull-requests: write
19-
contents: read
20-
issues: write
31+
32+
strategy:
33+
matrix:
34+
sdk:
35+
- py-sync-table
36+
- py-sync-query
2137

2238
steps:
2339
- name: Checkout repository
24-
uses: actions/checkout@v3
25-
if: env.DOCKER_REPO != null
26-
env:
27-
DOCKER_REPO: ${{ secrets.SLO_DOCKER_REPO }}
40+
uses: actions/checkout@v4
2841

29-
- name: Run SLO
30-
uses: ydb-platform/slo-tests@main
31-
if: env.DOCKER_REPO != null
32-
env:
33-
DOCKER_REPO: ${{ secrets.SLO_DOCKER_REPO }}
34-
continue-on-error: true
42+
- name: Install Python3
43+
uses: actions/setup-python@v5
3544
with:
36-
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
37-
KUBECONFIG_B64: ${{ secrets.SLO_KUBE_CONFIG }}
38-
AWS_CREDENTIALS_B64: ${{ secrets.SLO_AWS_CREDENTIALS }}
39-
AWS_CONFIG_B64: ${{ secrets.SLO_AWS_CONFIG }}
40-
DOCKER_USERNAME: ${{ secrets.SLO_DOCKER_USERNAME }}
41-
DOCKER_PASSWORD: ${{ secrets.SLO_DOCKER_PASSWORD }}
42-
DOCKER_REPO: ${{ secrets.SLO_DOCKER_REPO }}
43-
DOCKER_FOLDER: ${{ secrets.SLO_DOCKER_FOLDER }}
44-
s3_endpoint: ${{ secrets.SLO_S3_ENDPOINT }}
45-
s3_images_folder: ${{ vars.SLO_S3_IMAGES_FOLDER }}
46-
grafana_domain: ${{ vars.SLO_GRAFANA_DOMAIN }}
47-
grafana_dashboard: ${{ vars.SLO_GRAFANA_DASHBOARD }}
48-
ydb_version: 'newest'
49-
timeBetweenPhases: 30
50-
shutdownTime: 30
45+
python-version: '3.8'
46+
cache: 'pip'
5147

52-
language_id0: sync-python-table
53-
language0: Python SDK over Table Service
54-
workload_path0: tests/slo
55-
workload_build_context0: ../..
56-
workload_build_options0: -f Dockerfile --build-arg SDK_SERVICE=sync-python-table
48+
- name: Install dependencies
49+
run: |
50+
python -m pip install --no-cache-dir --upgrade pip
51+
python -m pip install --no-cache-dir -e .
52+
python -m pip install --no-cache-dir -r tests/slo/requirements.txt
5753
58-
language_id1: sync-python-query
59-
language1: Python SDK over Query Service
60-
workload_path1: tests/slo
61-
workload_build_context1: ../..
62-
workload_build_options1: -f Dockerfile --build-arg SDK_SERVICE=sync-python-query
54+
- name: Initialize YDB SLO
55+
uses: ydb-platform/ydb-slo-action/init@main
56+
with:
57+
github_pull_request_number: ${{ github.event.inputs.github_pull_request_number }}
58+
github_token: ${{ secrets.GITHUB_TOKEN }}
59+
sdk_name: ${{ matrix.sdk }}
6360

64-
- uses: actions/upload-artifact@v3
65-
if: env.DOCKER_REPO != null
61+
- name: Run SLO Tests
6662
env:
67-
DOCKER_REPO: ${{ secrets.SLO_DOCKER_REPO }}
68-
with:
69-
name: slo-logs
70-
path: logs/
63+
REF: '${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}'
64+
SDK_SERVICE: '${{ matrix.sdk }}'
65+
run: |
66+
python ./tests/slo create grpc://localhost:2135 /Root/testdb
67+
python ./tests/slo run grpc://localhost:2135 /Root/testdb \
68+
--prom-pgw localhost:9091 \
69+
--report-period 250 \
70+
--read-rps ${{inputs.slo_workload_read_max_rps || 1000}} \
71+
--write-rps ${{inputs.slo_workload_write_max_rps || 100}} \
72+
--read-timeout 10000 \
73+
--write-timeout 10000 \
74+
--time ${{inputs.slo_workload_duration_seconds || 600}} \
75+
--shutdown-time 30
76+
python ./tests/slo cleanup grpc://localhost:2135 /Root/testdb

tests/slo/Dockerfile

Lines changed: 0 additions & 11 deletions
This file was deleted.
File renamed without changes.
File renamed without changes.
File renamed without changes.

tests/slo/src/jobs.py renamed to tests/slo/jobs.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
import threading
1010

11-
from metrics import Metrics, JOB_WRITE_LABEL, JOB_READ_LABEL
11+
from metrics import Metrics, OP_TYPE_WRITE, OP_TYPE_READ
1212

1313
from generator import RowGenerator
1414

@@ -106,7 +106,7 @@ def check_result(result):
106106
query=query,
107107
params=params,
108108
metrics=metrics,
109-
labels=(JOB_READ_LABEL,),
109+
labels=(OP_TYPE_READ,),
110110
request_settings=request_settings,
111111
retry_settings=retry_setting,
112112
check_result_cb=check_result,
@@ -163,7 +163,7 @@ def check_result(result):
163163
query=query,
164164
params=params,
165165
metrics=metrics,
166-
labels=(JOB_READ_LABEL,),
166+
labels=(OP_TYPE_READ,),
167167
request_settings=request_settings,
168168
retry_settings=retry_setting,
169169
check_result_cb=check_result,
@@ -220,7 +220,7 @@ def run_writes(driver, query, row_generator, metrics, limiter, runtime, timeout)
220220
query=query,
221221
params=params,
222222
metrics=metrics,
223-
labels=(JOB_WRITE_LABEL,),
223+
labels=(OP_TYPE_WRITE,),
224224
request_settings=request_settings,
225225
retry_settings=retry_setting,
226226
)
@@ -285,7 +285,7 @@ def check_result(result):
285285
query=query,
286286
params=params,
287287
metrics=metrics,
288-
labels=(JOB_WRITE_LABEL,),
288+
labels=(OP_TYPE_WRITE,),
289289
request_settings=request_settings,
290290
retry_settings=retry_setting,
291291
check_result_cb=check_result,

tests/slo/metrics.py

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
import time
2+
from contextlib import contextmanager
3+
from importlib.metadata import version
4+
from collections.abc import Iterable
5+
6+
from os import environ
7+
8+
environ["PROMETHEUS_DISABLE_CREATED_SERIES"] = "True"
9+
10+
from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram, push_to_gateway # noqa: E402
11+
12+
OP_TYPE_READ, OP_TYPE_WRITE = "read", "write"
13+
OP_STATUS_SUCCESS, OP_STATUS_FAILURE = "success", "err"
14+
15+
REF = environ.get("REF", "main")
16+
SDK_SERVICE_NAME = environ.get("SDK_SERVICE", "py-sync-table")
17+
18+
19+
class Metrics:
    """Prometheus collectors for the SLO workload, pushed to a Pushgateway.

    Every collector lives in a private ``CollectorRegistry`` and is exposed as
    an attribute of the instance (``metrics.operations_total`` and so on) via
    ``__getattr__``.
    """

    def __init__(self, push_gateway):
        """Create the collectors, then clear them and push the initial state.

        Args:
            push_gateway: Pushgateway address handed to ``push_to_gateway``.
        """
        self._push_gtw = push_gateway
        self._registry = CollectorRegistry()
        self._metrics = dict(
            errors_total=Counter(
                "sdk_errors_total",
                "Total number of errors encountered, categorized by error type.",
                labelnames=("operation_type", "error_type"),
                registry=self._registry,
            ),
            operations_total=Counter(
                "sdk_operations_total",
                "Total number of operations, categorized by type attempted by the SDK.",
                labelnames=("operation_type",),
                registry=self._registry,
            ),
            operations_success_total=Counter(
                "sdk_operations_success_total",
                "Total number of successful operations, categorized by type.",
                labelnames=("operation_type",),
                registry=self._registry,
            ),
            operations_failure_total=Counter(
                "sdk_operations_failure_total",
                "Total number of failed operations, categorized by type.",
                labelnames=("operation_type",),
                registry=self._registry,
            ),
            operation_latency_seconds=Histogram(
                "sdk_operation_latency_seconds",
                "Latency of operations performed by the SDK in seconds, categorized by type and status.",
                labelnames=("operation_type", "operation_status"),
                registry=self._registry,
                buckets=(
                    0.001,  # 1 ms
                    0.002,  # 2 ms
                    0.003,  # 3 ms
                    0.004,  # 4 ms
                    0.005,  # 5 ms
                    0.0075,  # 7.5 ms
                    0.010,  # 10 ms
                    0.020,  # 20 ms
                    0.050,  # 50 ms
                    0.100,  # 100 ms
                    0.200,  # 200 ms
                    0.500,  # 500 ms
                    1.000,  # 1 s
                ),
            ),
            retry_attempts_total=Counter(
                "sdk_retry_attempts_total",
                "Total number of retry attempts, categorized by operation type.",
                labelnames=("operation_type",),
                registry=self._registry,
            ),
            retries_success_total=Counter(
                "sdk_retries_success_total",
                "Total number of successful retries, categorized by operation type.",
                labelnames=("operation_type",),
                registry=self._registry,
            ),
            retries_failure_total=Counter(
                "sdk_retries_failure_total",
                "Total number of failed retries, categorized by operation type.",
                labelnames=("operation_type",),
                registry=self._registry,
            ),
            pending_operations=Gauge(
                "sdk_pending_operations",
                "Current number of pending operations, categorized by type.",
                labelnames=("operation_type",),
                registry=self._registry,
            ),
        )
        self.reset()

    def __getattr__(self, item):
        """Expose each collector in ``_metrics`` as an instance attribute.

        Raises:
            AttributeError: if ``item`` is not the name of a known collector.
        """
        # Go through ``__dict__`` instead of ``self._metrics``: when the
        # instance has no ``_metrics`` yet (created via ``__new__``, copied,
        # unpickled) a plain attribute read would re-enter ``__getattr__``
        # and recurse until RecursionError.
        collectors = self.__dict__.get("_metrics", {})
        try:
            return collectors[item]
        except KeyError:
            raise AttributeError(
                f"'{Metrics.__name__}' object has no attribute '{item}'"
            ) from None

    @staticmethod
    def _normalize_labels(labels):
        """Return ``labels`` as a tuple of label values.

        A single scalar becomes a one-element tuple. Strings and bytes are
        treated as scalars: they are iterable, so a bare ``"read"`` would
        otherwise be splatted into one label value per character.
        """
        if isinstance(labels, (str, bytes)) or not isinstance(labels, Iterable):
            return (labels,)
        return tuple(labels)

    @contextmanager
    def measure(self, labels):
        """Context manager that records one operation around the ``with`` body.

        Calls ``start`` on entry and ``stop`` on exit. An ``Exception`` raised
        by the body is recorded as the operation's error and is NOT re-raised.

        Args:
            labels: label value(s) for the operation, as accepted by ``start``.
        """
        start_ts = self.start(labels)
        error = None
        try:
            yield self
        except Exception as err:
            # NOTE(review): the exception is swallowed here, as in the
            # original code; confirm callers rely on that before changing it.
            error = err
        finally:
            self.stop(labels, start_ts, error=error)

    def start(self, labels):
        """Mark an operation as pending and return its start timestamp.

        Args:
            labels: a single label value or an iterable of label values.

        Returns:
            The ``time.time()`` value to pass to ``stop`` as ``start_time``.
        """
        labels = self._normalize_labels(labels)

        self.pending_operations.labels(*labels).inc()
        return time.time()

    def stop(self, labels, start_time, attempts=1, error=None):
        """Record the outcome of an operation previously opened with ``start``.

        Args:
            labels: a single label value or an iterable of label values.
            start_time: timestamp returned by ``start``.
            attempts: number of attempts the operation took.
            error: the exception the operation failed with, or ``None`` on
                success; its class name is used as the ``error_type`` label.
        """
        duration = time.time() - start_time
        labels = self._normalize_labels(labels)

        self.operations_total.labels(*labels).inc()
        self.pending_operations.labels(*labels).dec()
        self.retry_attempts_total.labels(*labels).inc(attempts)

        if error:
            self.errors_total.labels(*labels, type(error).__name__).inc()
            self.retries_failure_total.labels(*labels).inc(attempts)
            self.operations_failure_total.labels(*labels).inc()
            self.operation_latency_seconds.labels(*labels, OP_STATUS_FAILURE).observe(duration)
            return

        self.retries_success_total.labels(*labels).inc(attempts)
        self.operations_success_total.labels(*labels).inc()
        self.operation_latency_seconds.labels(*labels, OP_STATUS_SUCCESS).observe(duration)

    def push(self):
        """Push the registry to the Pushgateway.

        The job name is derived from ``SDK_SERVICE_NAME``; the grouping key
        carries ``REF``, ``SDK_SERVICE_NAME`` and the installed ``ydb``
        package version.
        """
        push_to_gateway(
            self._push_gtw,
            job=f"workload-{SDK_SERVICE_NAME}",
            registry=self._registry,
            grouping_key={
                "ref": REF,
                "sdk": SDK_SERVICE_NAME,
                "sdk_version": version("ydb"),
            },
        )

    def reset(self):
        """Clear every collector, then push the cleared registry."""
        for m in self._metrics.values():
            m.clear()

        self.push()
File renamed without changes.

tests/slo/src/runner.py renamed to tests/slo/runner.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -91,13 +91,13 @@ def run_slo(args, driver, tb_name):
9191
logger.info("Max ID: %s", max_id)
9292

9393
metrics = Metrics(args.prom_pgw)
94-
if SDK_SERVICE_NAME == "sync-python-table":
94+
if SDK_SERVICE_NAME == "py-sync-table":
9595
futures = (
9696
*run_read_jobs(args, driver, tb_name, max_id, metrics),
9797
*run_write_jobs(args, driver, tb_name, max_id, metrics),
9898
run_metric_job(args, metrics),
9999
)
100-
elif SDK_SERVICE_NAME == "sync-python-query":
100+
elif SDK_SERVICE_NAME == "py-sync-query":
101101
futures = (
102102
*run_read_jobs_query(args, driver, tb_name, max_id, metrics),
103103
*run_write_jobs_query(args, driver, tb_name, max_id, metrics),
@@ -121,7 +121,6 @@ def run_from_args(args):
121121
driver_config = ydb.DriverConfig(
122122
args.endpoint,
123123
database=args.db,
124-
credentials=ydb.credentials_from_env_variables(),
125124
grpc_keep_alive_timeout=5000,
126125
)
127126

0 commit comments

Comments
 (0)