Skip to content

Commit 8480996

Browse files
committed
ci: migrate to new slo action
1 parent cea4669 commit 8480996

File tree

11 files changed

+225
-195
lines changed

11 files changed

+225
-195
lines changed

.github/workflows/slo.yml

Lines changed: 58 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,70 +1,76 @@
1-
name: SLO
1+
name: slo
22

33
on:
4+
push:
5+
branches:
6+
- main
47
pull_request:
5-
branches: [main]
8+
branches:
9+
- main
10+
- release-*
611
workflow_dispatch:
12+
inputs:
13+
github_pull_request_number:
14+
required: true
15+
slo_workload_duration_seconds:
16+
default: '600'
17+
required: false
18+
slo_workload_read_max_rps:
19+
default: '1000'
20+
required: false
21+
slo_workload_write_max_rps:
22+
default: '100'
23+
required: false
724

825
jobs:
9-
test-slo:
10-
concurrency:
11-
group: slo-${{ github.ref }}
26+
ydb-slo-action-init:
1227
if: (!contains(github.event.pull_request.labels.*.name, 'no slo'))
1328

29+
name: Run YDB SLO Tests
1430
runs-on: ubuntu-latest
15-
name: SLO test
16-
permissions:
17-
checks: write
18-
pull-requests: write
19-
contents: read
20-
issues: write
31+
32+
strategy:
33+
matrix:
34+
sdk:
35+
- py-sync-table
36+
- py-sync-query
2137

2238
steps:
2339
- name: Checkout repository
24-
uses: actions/checkout@v3
25-
if: env.DOCKER_REPO != null
26-
env:
27-
DOCKER_REPO: ${{ secrets.SLO_DOCKER_REPO }}
40+
uses: actions/checkout@v4
2841

29-
- name: Run SLO
30-
uses: ydb-platform/slo-tests@main
31-
if: env.DOCKER_REPO != null
32-
env:
33-
DOCKER_REPO: ${{ secrets.SLO_DOCKER_REPO }}
34-
continue-on-error: true
42+
- name: Install Python3
43+
uses: actions/setup-python@v5
3544
with:
36-
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
37-
KUBECONFIG_B64: ${{ secrets.SLO_KUBE_CONFIG }}
38-
AWS_CREDENTIALS_B64: ${{ secrets.SLO_AWS_CREDENTIALS }}
39-
AWS_CONFIG_B64: ${{ secrets.SLO_AWS_CONFIG }}
40-
DOCKER_USERNAME: ${{ secrets.SLO_DOCKER_USERNAME }}
41-
DOCKER_PASSWORD: ${{ secrets.SLO_DOCKER_PASSWORD }}
42-
DOCKER_REPO: ${{ secrets.SLO_DOCKER_REPO }}
43-
DOCKER_FOLDER: ${{ secrets.SLO_DOCKER_FOLDER }}
44-
s3_endpoint: ${{ secrets.SLO_S3_ENDPOINT }}
45-
s3_images_folder: ${{ vars.SLO_S3_IMAGES_FOLDER }}
46-
grafana_domain: ${{ vars.SLO_GRAFANA_DOMAIN }}
47-
grafana_dashboard: ${{ vars.SLO_GRAFANA_DASHBOARD }}
48-
ydb_version: 'newest'
49-
timeBetweenPhases: 30
50-
shutdownTime: 30
45+
python-version: '3.8'
46+
cache: 'pip'
5147

52-
language_id0: sync-python-table
53-
language0: Python SDK over Table Service
54-
workload_path0: tests/slo
55-
workload_build_context0: ../..
56-
workload_build_options0: -f Dockerfile --build-arg SDK_SERVICE=sync-python-table
48+
- name: Install dependencies
49+
run: |
50+
python -m pip install --no-cache-dir --upgrade pip
51+
python -m pip install --no-cache-dir -e .
52+
python -m pip install --no-cache-dir -r tests/slo/requirements.txt
5753
58-
language_id1: sync-python-query
59-
language1: Python SDK over Query Service
60-
workload_path1: tests/slo
61-
workload_build_context1: ../..
62-
workload_build_options1: -f Dockerfile --build-arg SDK_SERVICE=sync-python-query
54+
- name: Initialize YDB SLO
55+
uses: ydb-platform/ydb-slo-action/init@main
56+
with:
57+
github_pull_request_number: ${{ github.event.inputs.github_pull_request_number }}
58+
github_token: ${{ secrets.GITHUB_TOKEN }}
59+
sdk_name: ${{ matrix.sdk }}
6360

64-
- uses: actions/upload-artifact@v3
65-
if: env.DOCKER_REPO != null
61+
- name: Run SLO Tests
6662
env:
67-
DOCKER_REPO: ${{ secrets.SLO_DOCKER_REPO }}
68-
with:
69-
name: slo-logs
70-
path: logs/
63+
# `env` values are passed to the step verbatim (no shell parameter expansion),
# so resolve the branch name with an Actions expression instead.
REF: '${{ github.head_ref || github.ref_name }}'
64+
SDK_SERVICE: '${{ matrix.sdk }}'
65+
run: |
66+
python ./tests/slo create grpc://localhost:2135 /Root/testdb
67+
python ./tests/slo run grpc://localhost:2135 /Root/testdb \
68+
--prom-pgw localhost:9091 \
69+
--report-period 250 \
70+
--read-rps ${{inputs.slo_workload_read_max_rps || 1000}} \
71+
--write-rps ${{inputs.slo_workload_write_max_rps || 100}} \
72+
--read-timeout 10000 \
73+
--write-timeout 10000 \
74+
--time ${{inputs.slo_workload_duration_seconds || 600}} \
75+
--shutdown-time 30
76+
python ./tests/slo cleanup grpc://localhost:2135 /Root/testdb

tests/slo/Dockerfile

Lines changed: 0 additions & 11 deletions
This file was deleted.
File renamed without changes.
File renamed without changes.
File renamed without changes.

tests/slo/src/jobs.py renamed to tests/slo/jobs.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
import threading
1010

11-
from metrics import Metrics, JOB_WRITE_LABEL, JOB_READ_LABEL
11+
from metrics import Metrics, OP_TYPE_WRITE, OP_TYPE_READ
1212

1313
from generator import RowGenerator
1414

@@ -106,7 +106,7 @@ def check_result(result):
106106
query=query,
107107
params=params,
108108
metrics=metrics,
109-
labels=(JOB_READ_LABEL,),
109+
labels=(OP_TYPE_READ,),
110110
request_settings=request_settings,
111111
retry_settings=retry_setting,
112112
check_result_cb=check_result,
@@ -163,7 +163,7 @@ def check_result(result):
163163
query=query,
164164
params=params,
165165
metrics=metrics,
166-
labels=(JOB_READ_LABEL,),
166+
labels=(OP_TYPE_READ,),
167167
request_settings=request_settings,
168168
retry_settings=retry_setting,
169169
check_result_cb=check_result,
@@ -220,7 +220,7 @@ def run_writes(driver, query, row_generator, metrics, limiter, runtime, timeout)
220220
query=query,
221221
params=params,
222222
metrics=metrics,
223-
labels=(JOB_WRITE_LABEL,),
223+
labels=(OP_TYPE_WRITE,),
224224
request_settings=request_settings,
225225
retry_settings=retry_setting,
226226
)
@@ -285,7 +285,7 @@ def check_result(result):
285285
query=query,
286286
params=params,
287287
metrics=metrics,
288-
labels=(JOB_WRITE_LABEL,),
288+
labels=(OP_TYPE_WRITE,),
289289
request_settings=request_settings,
290290
retry_settings=retry_setting,
291291
check_result_cb=check_result,

tests/slo/metrics.py

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
import time
2+
from contextlib import contextmanager
3+
from importlib.metadata import version
4+
from collections.abc import Iterable
5+
6+
from os import environ
7+
8+
environ["PROMETHEUS_DISABLE_CREATED_SERIES"] = "True"
9+
10+
from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram, push_to_gateway # noqa: E402
11+
12+
# Label values used for the "operation_type" and "operation_status" labels.
OP_TYPE_READ, OP_TYPE_WRITE = "read", "write"
OP_STATUS_SUCCESS, OP_STATUS_FAILURE = "success", "err"

# Grouping-key values attached to every push; overridable through the environment.
REF = environ.get("REF", "main")
SDK_SERVICE_NAME = environ.get("SDK_SERVICE", "py-sync-table")


def _normalize_labels(labels):
    """Return *labels* as a tuple of label values.

    A bare string (or bytes) is a single label value, not an iterable of
    characters, so it is wrapped rather than splatted character by character.
    Any other non-iterable value is wrapped the same way.
    """
    if isinstance(labels, (str, bytes)) or not isinstance(labels, Iterable):
        return (labels,)
    return tuple(labels)


class Metrics:
    """Prometheus metrics for the SLO workload, pushed to a Pushgateway.

    Each metric is stored in ``self._metrics`` under a short name and is
    reachable as an attribute of the instance (see ``__getattr__``).
    """

    def __init__(self, push_gateway):
        """Register all metrics and push an initial, cleared state.

        Args:
            push_gateway: Pushgateway address handed to ``push_to_gateway``.

        Note:
            Construction ends with ``reset()``, which pushes to the gateway.
        """
        self._push_gtw = push_gateway
        self._registry = CollectorRegistry()
        self._metrics = dict(
            errors_total=Counter(
                "sdk_errors_total",
                "Total number of errors encountered, categorized by error type.",
                labelnames=("operation_type", "error_type"),
                registry=self._registry,
            ),
            operations_total=Counter(
                "sdk_operations_total",
                "Total number of operations, categorized by type attempted by the SDK.",
                labelnames=("operation_type",),
                registry=self._registry,
            ),
            operations_success_total=Counter(
                "sdk_operations_success_total",
                "Total number of successful operations, categorized by type.",
                labelnames=("operation_type",),
                registry=self._registry,
            ),
            operations_failure_total=Counter(
                "sdk_operations_failure_total",
                "Total number of failed operations, categorized by type.",
                labelnames=("operation_type",),
                registry=self._registry,
            ),
            operation_latency_seconds=Histogram(
                "sdk_operation_latency_seconds",
                "Latency of operations performed by the SDK in seconds, categorized by type and status.",
                labelnames=(
                    "operation_type",
                    "operation_status",
                ),
                registry=self._registry,
                buckets=(
                    0.001,  # 1 ms
                    0.002,  # 2 ms
                    0.003,  # 3 ms
                    0.004,  # 4 ms
                    0.005,  # 5 ms
                    0.0075,  # 7.5 ms
                    0.010,  # 10 ms
                    0.020,  # 20 ms
                    0.050,  # 50 ms
                    0.100,  # 100 ms
                    0.200,  # 200 ms
                    0.500,  # 500 ms
                    1.000,  # 1 s
                ),
            ),
            retry_attempts_total=Counter(
                "sdk_retry_attempts_total",
                "Total number of retry attempts, categorized by operation type.",
                labelnames=("operation_type",),
                registry=self._registry,
            ),
            retries_success_total=Counter(
                "sdk_retries_success_total",
                "Total number of successful retries, categorized by operation type.",
                labelnames=("operation_type",),
                registry=self._registry,
            ),
            retries_failure_total=Counter(
                "sdk_retries_failure_total",
                "Total number of failed retries, categorized by operation type.",
                labelnames=("operation_type",),
                registry=self._registry,
            ),
            pending_operations=Gauge(
                "sdk_pending_operations",
                "Current number of pending operations, categorized by type.",
                labelnames=("operation_type",),
                registry=self._registry,
            ),
        )
        self.reset()

    def __getattr__(self, item):
        """Expose the entries of ``self._metrics`` as attributes.

        Raises:
            AttributeError: if *item* is not a registered metric name.
        """
        # Read __dict__ directly: `self._metrics` would re-enter __getattr__
        # and recurse forever when `_metrics` has not been assigned yet.
        metrics = self.__dict__.get("_metrics", {})
        try:
            return metrics[item]
        except KeyError:
            raise AttributeError(f"'{Metrics.__name__}' object has no attribute '{item}'") from None

    @contextmanager
    def measure(self, labels):
        """Time the ``with`` body and record it as one operation.

        Any ``Exception`` raised by the body is recorded as a failure and is
        NOT re-raised, so the caller continues after the ``with`` block.
        NOTE(review): the suppression looks like deliberate best-effort for the
        workload loop -- confirm before relying on it.
        """
        start_ts = self.start(labels)
        error = None
        try:
            yield self
        except Exception as err:
            error = err
        finally:
            self.stop(labels, start_ts, error=error)

    def start(self, labels):
        """Mark an operation as pending and return its start timestamp.

        Args:
            labels: a single label value or an iterable of label values.

        Returns:
            The ``time.time()`` value to hand back to ``stop``.
        """
        labels = _normalize_labels(labels)

        self.pending_operations.labels(*labels).inc()
        return time.time()

    def stop(self, labels, start_time, attempts=1, error=None):
        """Record the outcome of an operation started with ``start``.

        Args:
            labels: a single label value or an iterable of label values.
            start_time: timestamp returned by ``start``.
            attempts: number of attempts added to the retry counters.
            error: the exception that ended the operation, or ``None`` on success.
        """
        duration = time.time() - start_time
        labels = _normalize_labels(labels)

        self.operations_total.labels(*labels).inc()
        self.pending_operations.labels(*labels).dec()
        self.retry_attempts_total.labels(*labels).inc(attempts)

        if error:
            # The exception class name becomes the "error_type" label value.
            self.errors_total.labels(*labels, type(error).__name__).inc()
            self.retries_failure_total.labels(*labels).inc(attempts)
            self.operations_failure_total.labels(*labels).inc()
            self.operation_latency_seconds.labels(*labels, OP_STATUS_FAILURE).observe(duration)
            return

        self.retries_success_total.labels(*labels).inc(attempts)
        self.operations_success_total.labels(*labels).inc()
        self.operation_latency_seconds.labels(*labels, OP_STATUS_SUCCESS).observe(duration)

    def push(self):
        """Push the registry to the gateway under job ``workload-<sdk>``.

        The grouping key carries the ref, the SDK service name and the
        installed version of the ``ydb`` distribution.
        """
        push_to_gateway(
            self._push_gtw,
            job=f"workload-{SDK_SERVICE_NAME}",
            registry=self._registry,
            grouping_key={
                "ref": REF,
                "sdk": SDK_SERVICE_NAME,
                "sdk_version": version("ydb"),
            },
        )

    def reset(self):
        """Clear every registered metric, then push the cleared state."""
        for m in self._metrics.values():
            m.clear()

        self.push()
File renamed without changes.

tests/slo/src/runner.py renamed to tests/slo/runner.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -91,13 +91,13 @@ def run_slo(args, driver, tb_name):
9191
logger.info("Max ID: %s", max_id)
9292

9393
metrics = Metrics(args.prom_pgw)
94-
if SDK_SERVICE_NAME == "sync-python-table":
94+
if SDK_SERVICE_NAME == "py-sync-table":
9595
futures = (
9696
*run_read_jobs(args, driver, tb_name, max_id, metrics),
9797
*run_write_jobs(args, driver, tb_name, max_id, metrics),
9898
run_metric_job(args, metrics),
9999
)
100-
elif SDK_SERVICE_NAME == "sync-python-query":
100+
elif SDK_SERVICE_NAME == "py-sync-query":
101101
futures = (
102102
*run_read_jobs_query(args, driver, tb_name, max_id, metrics),
103103
*run_write_jobs_query(args, driver, tb_name, max_id, metrics),
@@ -121,7 +121,6 @@ def run_from_args(args):
121121
driver_config = ydb.DriverConfig(
122122
args.endpoint,
123123
database=args.db,
124-
credentials=ydb.credentials_from_env_variables(),
125124
grpc_keep_alive_timeout=5000,
126125
)
127126

0 commit comments

Comments
 (0)