diff --git a/.github/workflows/slo.yml b/.github/workflows/slo.yml index 1e9451c0..34cd9511 100644 --- a/.github/workflows/slo.yml +++ b/.github/workflows/slo.yml @@ -1,75 +1,83 @@ +name: SLO + on: push: - branches: [main] + branches: + - main pull_request: - branches: [main] + branches: + - main workflow_dispatch: - -name: SLO + inputs: + github_pull_request_number: + required: true + slo_workload_duration_seconds: + default: '600' + required: false + slo_workload_read_max_rps: + default: '1000' + required: false + slo_workload_write_max_rps: + default: '1000' + required: false jobs: - test-slo: - concurrency: - group: slo-${{ github.ref }} + ydb-slo-action: if: (!contains(github.event.pull_request.labels.*.name, 'no slo')) - runs-on: ubuntu-latest name: SLO test - permissions: - checks: write - pull-requests: write - contents: read - issues: write - + runs-on: ubuntu-latest + strategy: + matrix: + workload: + - AdoNet + + concurrency: + group: slo-${{ github.ref }}-${{ matrix.workload }} + cancel-in-progress: true + steps: - name: Checkout repository uses: actions/checkout@v4 - if: env.DOCKER_REPO != null - env: - DOCKER_REPO: ${{ secrets.SLO_DOCKER_REPO }} - - name: Run SLO - uses: ydb-platform/slo-tests@main - if: env.DOCKER_REPO != null - env: - DOCKER_REPO: ${{ secrets.SLO_DOCKER_REPO }} - continue-on-error: true + - name: Install .NET + uses: actions/setup-dotnet@v4 + with: + dotnet-version: 8.0.x + + - name: Initialize YDB SLO + uses: ydb-platform/ydb-slo-action/init@main with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - KUBECONFIG_B64: ${{ secrets.SLO_KUBE_CONFIG }} - AWS_CREDENTIALS_B64: ${{ secrets.SLO_AWS_CREDENTIALS }} - AWS_CONFIG_B64: ${{ secrets.SLO_AWS_CONFIG }} - DOCKER_USERNAME: ${{ secrets.SLO_DOCKER_USERNAME }} - DOCKER_PASSWORD: ${{ secrets.SLO_DOCKER_PASSWORD }} - DOCKER_REPO: ${{ secrets.SLO_DOCKER_REPO }} - DOCKER_FOLDER: ${{ secrets.SLO_DOCKER_FOLDER }} - s3_endpoint: ${{ secrets.SLO_S3_ENDPOINT }} - s3_images_folder: ${{ vars.SLO_S3_IMAGES_FOLDER }} - grafana_domain: ${{ vars.SLO_GRAFANA_DOMAIN }} - # grafana_dashboard: ${{ vars.SLO_GRAFANA_DASHBOARD }} - grafana_dashboard: dca60386-0d3d-43f5-a2af-5f3fd3e3b295 - grafana_dashboard_width: 2000 - grafana_dashboard_height: 2300 - ydb_version: 'newest' - timeBetweenPhases: 30 - shutdownTime: 30 + github_pull_request_number: ${{ github.event.inputs.github_pull_request_number }} + github_token: ${{ secrets.GITHUB_TOKEN }} + workload_name: ${{ matrix.workload }} + ydb_database_node_count: 5 + + - name: Prepare SLO Database + run: | + cd slo/src/${{ matrix.workload }} + dotnet run create grpc://localhost:2135 /Root/testdb + + - name: Run SLO Tests + run: | + cd slo/src/${{ matrix.workload }} + dotnet run run grpc://localhost:2135 /Root/testdb \ + --prom-pgw http://localhost:9091 \ + --report-period 250 \ + --time ${{inputs.slo_workload_duration_seconds || 600 }} \ + --read-rps ${{inputs.slo_workload_read_max_rps || 1000 }} \ + --write-rps ${{inputs.slo_workload_write_max_rps || 1000 }} \ + --read-timeout 1000 \ + --write-timeout 1000 - language_id0: 'table-service' - workload_path0: 'slo/src' - language0: '.NET SDK over table-service' - workload_build_context0: ../.. - workload_build_options0: -f Dockerfile --build-arg SRC_PATH=TableService - - language_id1: 'ado-net' - workload_path1: 'slo/src' - language1: 'ADO.NET over query-service' - workload_build_context1: ../.. - workload_build_options1: -f Dockerfile --build-arg SRC_PATH=AdoNet + - if: always() + name: Store ydb chaos testing logs + run: | + docker logs ydb-chaos > chaos-ydb.log - - uses: actions/upload-artifact@v3 - if: always() && env.DOCKER_REPO != null - env: - DOCKER_REPO: ${{ secrets.SLO_DOCKER_REPO }} + - if: always() + uses: actions/upload-artifact@v4 with: - name: slo-logs - path: logs/ + name: ${{ matrix.workload }}-chaos-ydb.log + path: ./chaos-ydb.log + retention-days: 1 diff --git a/slo/playground/configs/chaos.sh b/slo/playground/configs/chaos.sh new file mode 100644 index 00000000..f450437e --- /dev/null +++ b/slo/playground/configs/chaos.sh @@ -0,0 +1,40 @@ +#!/bin/sh -e + +get_random_container() { + # Get a list of all containers starting with ydb-database-* + containers=$(docker ps --format '{{.Names}}' | grep '^ydb-database-') + + # Convert the list to a newline-separated string + containers=$(echo "$containers" | tr ' ' '\n') + + # Count the number of containers + containersCount=$(echo "$containers" | wc -l) + + # Generate a random number between 0 and containersCount - 1 + randomIndex=$(shuf -i 0-$(($containersCount - 1)) -n 1) + + # Get the container name at the random index + nodeForChaos=$(echo "$containers" | sed -n "$(($randomIndex + 1))p") +} + + +sleep 60 + +get_random_container + +sh -c "docker stop ${nodeForChaos} -t 30" +sh -c "docker start ${nodeForChaos}" + +sleep 60 + +get_random_container + +sh -c "docker restart ${nodeForChaos} -t 0" + +sleep 60 + +get_random_container + +sh -c "docker kill -s SIGKILL ${nodeForChaos}" + +sleep 60 diff --git a/slo/playground/configs/compose.yaml b/slo/playground/configs/compose.yaml new file mode 100644 index 00000000..5fa387c6 --- /dev/null +++ b/slo/playground/configs/compose.yaml @@ -0,0 +1,311 @@ +# Code generated by Github Action; DO NOT EDIT. + +x-runtime: &runtime + hostname: localhost + platform: linux/amd64 + privileged: true + network_mode: host + +x-ydb-node: &ydb-node + image: cr.yandex/crptqonuodf51kdj7a7d/ydb:24.2.7 + restart: always + <<: *runtime + volumes: + - ./ydb.yaml:/opt/ydb/cfg/config.yaml + +name: ydb + +services: + static-0: + <<: *ydb-node + container_name: ydb-static-0 + command: + - /opt/ydb/bin/ydbd + - server + - --grpc-port + - "2135" + - --mon-port + - "8765" + - --ic-port + - "19001" + - --yaml-config + - /opt/ydb/cfg/config.yaml + - --node + - static + - --label + - deployment=docker + ports: + - 2135:2135 + - 8765:8765 + - 19001:19001 + healthcheck: + test: bash -c "exec 6<> /dev/tcp/localhost/2135" + interval: 10s + timeout: 1s + retries: 3 + start_period: 30s + + static-init: + <<: *ydb-node + restart: on-failure + container_name: ydb-static-init + command: + - /opt/ydb/bin/ydbd + - -s + - grpc://localhost:2135 + - admin + - blobstorage + - config + - init + - --yaml-file + - /opt/ydb/cfg/config.yaml + depends_on: + static-0: + condition: service_healthy + + tenant-init: + <<: *ydb-node + restart: on-failure + container_name: ydb-tenant-init + command: + - /opt/ydb/bin/ydbd + - -s + - grpc://localhost:2135 + - admin + - database + - /Root/testdb + - create + - ssd:1 + depends_on: + static-init: + condition: service_completed_successfully + + database-1: + <<: *ydb-node + container_name: ydb-database-1 + command: + - /opt/ydb/bin/ydbd + - server + - --grpc-port + - "2136" + - --mon-port + - "8766" + - --ic-port + - "19002" + - --yaml-config + - /opt/ydb/cfg/config.yaml + - --tenant + - /Root/testdb + - --node-broker + - grpc://localhost:2135 + - --label + - deployment=docker + ports: + - 2136:2136 + - 8766:8766 + - 19002:19002 + healthcheck: + test: bash -c "exec 6<> /dev/tcp/localhost/2136" + interval: 10s + timeout: 1s + retries: 3 + start_period: 30s + depends_on: + static-0: + condition: service_healthy + static-init: + condition: service_completed_successfully + tenant-init: + condition: service_completed_successfully + + database-2: + <<: *ydb-node + container_name: ydb-database-2 + command: + - /opt/ydb/bin/ydbd + - server + - --grpc-port + - "2137" + - --mon-port + - "8767" + - --ic-port + - "19003" + - --yaml-config + - /opt/ydb/cfg/config.yaml + - --tenant + - /Root/testdb + - --node-broker + - grpc://localhost:2135 + - --label + - deployment=docker + ports: + - 2137:2137 + - 8767:8767 + - 19003:19003 + healthcheck: + test: bash -c "exec 6<> /dev/tcp/localhost/2137" + interval: 10s + timeout: 1s + retries: 3 + start_period: 30s + depends_on: + static-0: + condition: service_healthy + static-init: + condition: service_completed_successfully + tenant-init: + condition: service_completed_successfully + + database-3: + <<: *ydb-node + container_name: ydb-database-3 + command: + - /opt/ydb/bin/ydbd + - server + - --grpc-port + - "2138" + - --mon-port + - "8768" + - --ic-port + - "19004" + - --yaml-config + - /opt/ydb/cfg/config.yaml + - --tenant + - /Root/testdb + - --node-broker + - grpc://localhost:2135 + - --label + - deployment=docker + ports: + - 2138:2138 + - 8768:8768 + - 19004:19004 + healthcheck: + test: bash -c "exec 6<> /dev/tcp/localhost/2138" + interval: 10s + timeout: 1s + retries: 3 + start_period: 30s + depends_on: + static-0: + condition: service_healthy + static-init: + condition: service_completed_successfully + tenant-init: + condition: service_completed_successfully + + database-4: + <<: *ydb-node + container_name: ydb-database-4 + command: + - /opt/ydb/bin/ydbd + - server + - --grpc-port + - "2139" + - --mon-port + - "8769" + - --ic-port + - "19005" + - --yaml-config + - /opt/ydb/cfg/config.yaml + - --tenant + - /Root/testdb + - --node-broker + - grpc://localhost:2135 + - --label + - deployment=docker + ports: + - 2139:2139 + - 8769:8769 + - 19005:19005 + healthcheck: + test: bash -c "exec 6<> /dev/tcp/localhost/2139" + interval: 10s + timeout: 1s + retries: 3 + start_period: 30s + depends_on: + static-0: + condition: service_healthy + static-init: + condition: service_completed_successfully + tenant-init: + condition: service_completed_successfully + + database-5: + <<: *ydb-node + container_name: ydb-database-5 + command: + - /opt/ydb/bin/ydbd + - server + - --grpc-port + - "2140" + - --mon-port + - "8770" + - --ic-port + - "19006" + - --yaml-config + - /opt/ydb/cfg/config.yaml + - --tenant + - /Root/testdb + - --node-broker + - grpc://localhost:2135 + - --label + - deployment=docker + ports: + - 2140:2140 + - 8770:8770 + - 19006:19006 + healthcheck: + test: bash -c "exec 6<> /dev/tcp/localhost/2140" + interval: 10s + timeout: 1s + retries: 3 + start_period: 30s + depends_on: + static-0: + condition: service_healthy + static-init: + condition: service_completed_successfully + tenant-init: + condition: service_completed_successfully + + prometheus: + image: prom/prometheus + restart: unless-stopped + <<: *runtime + ports: + - "9090:9090" + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml + deploy: &monitoring-deploy + resources: + limits: + cpus: "0.1" + memory: 1000M + reservations: + cpus: "0.001" + memory: 50M + + prometheus-pushgateway: + image: prom/pushgateway + restart: unless-stopped + <<: *runtime + ports: + - "9091:9091" + deploy: + <<: *monitoring-deploy + +# chaos: +# image: docker:latest +# restart: on-failure +# container_name: ydb-chaos +# <<: *runtime +# entrypoint: ["/bin/sh", "-c", "ls -la /opt/ydb && /opt/ydb/chaos.sh"] +# volumes: +# - ./chaos.sh:/opt/ydb/chaos.sh +# - ./ydb.yaml:/opt/ydb/cfg/config.yaml +# - /var/run/docker.sock:/var/run/docker.sock +# depends_on: +# static-0: +# condition: service_healthy diff --git a/slo/playground/configs/grafana/provisioning/dashboards/dashboard.yml b/slo/playground/configs/grafana/provisioning/dashboards/dashboard.yml deleted file mode 100644 index c6784142..00000000 --- a/slo/playground/configs/grafana/provisioning/dashboards/dashboard.yml +++ /dev/null @@ -1,6 +0,0 @@ -apiVersion: 1 - -providers: - - name: 'SLO' - options: - path: /etc/grafana/provisioning/dashboards diff --git a/slo/playground/configs/grafana/provisioning/dashboards/slo.json b/slo/playground/configs/grafana/provisioning/dashboards/slo.json deleted file mode 100644 index 69d76bf7..00000000 --- a/slo/playground/configs/grafana/provisioning/dashboards/slo.json +++ /dev/null @@ -1,646 +0,0 @@ -{ - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "grafana", - "uid": "-- Grafana --" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "target": { - "limit": 100, - "matchAny": false, - "tags": [], - "type": "dashboard" - }, - "type": "dashboard" - } - ] - }, - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 0, - "links": [], - "liveNow": false, - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 0 - }, - "id": 12, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "builder", - "expr": "rate(oks[$__rate_interval]) > 0", - "hide": false, - "legendFormat": "({{sdk}}-{{sdkVersion}}) {{jobName}} OK", - "range": true, - "refId": "B" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "builder", - "expr": "rate(not_oks[$__rate_interval]) > 0", - "hide": false, - "legendFormat": "({{sdk}}-{{sdkVersion}}) {{jobName}} not OK", - "range": true, - "refId": "C" - } - ], - "title": "SLO Requests RPS", - "transformations": [], - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMin": 0, - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 0, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 0 - }, - "id": 14, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "builder", - "expr": "histogram_quantile(1, rate(attempts_bucket[$__rate_interval]))", - "hide": false, - "legendFormat": "{{sdk}}-{{sdkVersion}} {{jobName}}-{{status}}", - "range": true, - "refId": "A" - } - ], - "title": "Attempts", - "transformations": [], - "type": "timeseries" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 9 - }, - "id": 7, - "panels": [], - "title": "Latencies", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 10 - }, - "id": 4, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "9.3.1", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "builder", - "expr": "latency{jobName=\"read\", status=\"ok\"} > 0", - "legendFormat": "{{sdk}}-{{sdkVersion}}-p{{quantile}}", - "range": true, - "refId": "A" - } - ], - "title": "Read Latencies (OK)", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 10 - }, - "id": 5, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "builder", - "expr": "latency{jobName=\"write\", status=\"ok\"} > 0", - "legendFormat": "{{sdk}}-{{sdkVersion}}-p{{quantile}}", - "range": true, - "refId": "A" - } - ], - "title": "Write Latencies (OK)", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 18 - }, - "id": 10, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "builder", - "expr": "latency{jobName=\"read\", status=\"err\"} > 0", - "legendFormat": "{{sdk}}-{{sdkVersion}}-p{{quantile}}", - "range": true, - "refId": "A" - } - ], - "title": "Read Latencies (NOT OK)", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 18 - }, - "id": 11, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "builder", - "expr": "latency{jobName=\"write\", status=\"err\"} > 0", - "legendFormat": "{{sdk}}-{{sdkVersion}}-p{{quantile}}", - "range": true, - "refId": "A" - } - ], - "title": "Write Latencies (NOT OK)", - "type": "timeseries" - } - ], - "refresh": "", - "revision": 1, - "schemaVersion": 38, - "style": "dark", - "tags": [], - "templating": { - "list": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "filters": [], - "hide": 0, - "label": "", - "name": "filter", - "skipUrlSync": false, - "type": "adhoc" - } - ] - }, - "time": { - "from": "now-15m", - "to": "now" - }, - "timepicker": {}, - "timezone": "", - "title": "SLO", - "uid": "7CzMl5t4k", - "version": 1, - "weekStart": "" -} diff --git a/slo/playground/configs/grafana/provisioning/datasources/datasource.yml b/slo/playground/configs/grafana/provisioning/datasources/datasource.yml deleted file mode 100644 index 0b62b9c3..00000000 --- a/slo/playground/configs/grafana/provisioning/datasources/datasource.yml +++ /dev/null @@ -1,11 +0,0 @@ -apiVersion: 1 - -datasources: - - name: prometheus - type: prometheus - access: proxy - orgId: 1 - url: http://prometheus:9090 - basicAuth: false - isDefault: true - editable: true diff --git a/slo/playground/configs/prometheus.yml b/slo/playground/configs/prometheus.yml new file mode 100644 index 00000000..8c89c766 --- /dev/null +++ b/slo/playground/configs/prometheus.yml @@ -0,0 +1,9 @@ +global: + scrape_interval: 1s + evaluation_interval: 1s + +scrape_configs: + - job_name: "pushgateway" + static_configs: + - targets: + - localhost:9091 diff --git a/slo/playground/configs/prometheus/prometheus.yml b/slo/playground/configs/prometheus/prometheus.yml deleted file mode 100644 index 281b390b..00000000 --- a/slo/playground/configs/prometheus/prometheus.yml +++ /dev/null @@ -1,8 +0,0 @@ -global: - scrape_interval: 1s - evaluation_interval: 1s - -scrape_configs: - - job_name: 'slo' - static_configs: - - targets: ['prometheus-pushgateway:9091'] diff --git a/slo/playground/configs/ydb.yaml b/slo/playground/configs/ydb.yaml new file mode 100644 index 00000000..3e13b5d2 --- /dev/null +++ b/slo/playground/configs/ydb.yaml @@ -0,0 +1,60 @@ +actor_system_config: + cpu_count: 1 + node_type: STORAGE + use_auto_config: true +blob_storage_config: + service_set: + groups: + - erasure_species: none + rings: + - fail_domains: + - vdisk_locations: + - node_id: 1 + path: SectorMap:1:64 + pdisk_category: SSD +channel_profile_config: + profile: + - channel: + - erasure_species: none + pdisk_category: 0 + storage_pool_kind: ssd + - erasure_species: none + pdisk_category: 0 + storage_pool_kind: ssd + - erasure_species: none + pdisk_category: 0 + storage_pool_kind: ssd + profile_id: 0 +domains_config: + domain: + - name: Root + storage_pool_types: + - kind: ssd + pool_config: + box_id: 1 + erasure_species: none + kind: ssd + pdisk_filter: + - property: + - type: SSD + vdisk_kind: Default + state_storage: + - ring: + node: [1] + nto_select: 1 + ssid: 1 +host_configs: + - drive: + - path: SectorMap:1:64 + type: SSD + host_config_id: 1 +hosts: + - host: localhost + host_config_id: 1 + node_id: 1 + port: 19001 + walle_location: + body: 1 + data_center: az-1 + rack: "0" +static_erasure: none diff --git a/slo/playground/docker-compose.yml b/slo/playground/docker-compose.yml deleted file mode 100644 index 5dee06d8..00000000 --- a/slo/playground/docker-compose.yml +++ /dev/null @@ -1,108 +0,0 @@ -version: '2.1' - -networks: - monitor-net: - driver: bridge - -services: - prometheus: - image: prom/prometheus:v2.44.0 - container_name: prometheus - user: "$UID:$GID" - volumes: - - ./configs/prometheus:/etc/prometheus - - ../data/prometheus:/prometheus - command: - - '--config.file=/etc/prometheus/prometheus.yml' - - '--storage.tsdb.path=/prometheus' - - '--storage.tsdb.retention.time=200h' - - '--web.enable-lifecycle' - restart: unless-stopped - ports: - - "9090:9090" - networks: - - monitor-net - - prometheus-pushgateway: - image: prom/pushgateway:v1.6.0 - container_name: prometheus-pushgateway - ports: - - "9091:9091" - networks: - - monitor-net - - grafana: - image: grafana/grafana:9.5.3 - container_name: grafana - user: "$UID:$GID" - volumes: - - ./configs/grafana/provisioning:/etc/grafana/provisioning - - ../data/grafana:/var/lib/grafana - environment: - - GF_SECURITY_ADMIN_USER=admin - - GF_SECURITY_ADMIN_PASSWORD=passw0rD - restart: unless-stopped - ports: - - "3000:3000" - networks: - - monitor-net - - ydb: - image: cr.yandex/yc/yandex-docker-local-ydb:latest - container_name: ydb - environment: - - GRPC_TLS_PORT=2135 - - GRPC_PORT=2136 - - MON_PORT=8765 - - YDB_USE_IN_MEMORY_PDISKS=true - - YDB_DEFAULT_LOG_LEVEL=NOTICE - ports: - - "2135:2135" - - "2136:2136" - - "8765:8765" - volumes: - - ../data/ydb_certs:/ydb_certs - networks: - - monitor-net - - slo-create: - build: - context: ../.. - dockerfile: slo/src/Dockerfile - command: - - 'create' - - 'http://ydb:2136' - - '/local' - - '--table-name' - - 'slo-dotnet' - - '--min-partitions-count' - - '6' - - '--max-partitions-count' - - '1000' - - '--partition-size' - - '1' - - '--initial-data-count' - - '1000' - networks: - - monitor-net - depends_on: - ydb: - condition: service_healthy - - slo-run: - build: - context: ../.. - dockerfile: slo/src/Dockerfile - command: - - 'run' - - 'http://ydb:2136' - - '/local' - - '--prom-pgw' - - 'http://prometheus-pushgateway:9091' - - '--table-name' - - 'slo-dotnet' - networks: - - monitor-net - depends_on: - slo-create: - condition: service_completed_successfully diff --git a/slo/src/AdoNet/SloContext.cs b/slo/src/AdoNet/SloContext.cs index 5ead9eec..32cd965e 100644 --- a/slo/src/AdoNet/SloContext.cs +++ b/slo/src/AdoNet/SloContext.cs @@ -14,13 +14,13 @@ public class SloContext : SloContext .WaitAndRetryAsync(10, attempt => TimeSpan.FromMilliseconds(attempt * 10), (e, _, _, context) => { - var errorsGauge = (Gauge)context["errorsGauge"]; + var errorsTotal = (Counter)context["errorsTotal"]; Logger.LogWarning(e, "Failed read / write operation"); - errorsGauge?.WithLabels(((YdbException)e).Code.StatusName(), "retried").Inc(); + errorsTotal?.WithLabels(((YdbException)e).Code.StatusName(), "retried").Inc(); }); - protected override string Job => "workload-ado-net"; + protected override string Job => "AdoNet"; protected override async Task Create(YdbDataSource client, string createTableSql, int operationTimeout) { @@ -32,12 +32,12 @@ protected override async Task Create(YdbDataSource client, string createTableSql } protected override async Task<(int, StatusCode)> Upsert(YdbDataSource dataSource, string upsertSql, - Dictionary parameters, int writeTimeout, Gauge? errorsGauge = null) + Dictionary parameters, int writeTimeout, Counter? errorsTotal = null) { var context = new Context(); - if (errorsGauge != null) + if (errorsTotal != null) { - context["errorsGauge"] = errorsGauge; + context["errorsTotal"] = errorsTotal; } var policyResult = await _policy.ExecuteAndCaptureAsync(async _ => @@ -61,12 +61,12 @@ protected override async Task Create(YdbDataSource client, string createTableSql } protected override async Task<(int, StatusCode, object?)> Select(YdbDataSource dataSource, string selectSql, - Dictionary parameters, int readTimeout, Gauge? errorsGauge = null) + Dictionary parameters, int readTimeout, Counter? errorsTotal = null) { var context = new Context(); - if (errorsGauge != null) + if (errorsTotal != null) { - context["errorsGauge"] = errorsGauge; + context["errorsTotal"] = errorsTotal; } var attempts = 0; @@ -105,4 +105,4 @@ protected override Task CreateClient(Config config) return Task.FromResult(new YdbDataSource(new YdbConnectionStringBuilder { UseTls = useTls, Host = host, Port = int.Parse(port), Database = config.Db, LoggerFactory = Factory })); } -} \ No newline at end of file +} diff --git a/slo/src/Dockerfile b/slo/src/Dockerfile deleted file mode 100644 index cc42649d..00000000 --- a/slo/src/Dockerfile +++ /dev/null @@ -1,15 +0,0 @@ -FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build -ARG SRC_PATH -COPY ../ /src -RUN ls /src -WORKDIR /src/slo/src/${SRC_PATH} -RUN ls -RUN dotnet restore "${SRC_PATH}.csproj" -RUN dotnet publish "${SRC_PATH}.csproj" -c release -o /app --no-restore -f net8.0 - -##################### - -FROM mcr.microsoft.com/dotnet/runtime:8.0 AS run -WORKDIR /app -COPY --from=build /app ./ -ENTRYPOINT ["./slo"] diff --git a/slo/src/Internal/Cli.cs b/slo/src/Internal/Cli.cs index c8ff60c9..41e45a92 100644 --- a/slo/src/Internal/Cli.cs +++ b/slo/src/Internal/Cli.cs @@ -12,39 +12,19 @@ public static class Cli "db", "YDB database to connect to"); - private static readonly Option TableOption = new( - new[] { "-t", "--table-name" }, - () => "testingTable", - "table name to create\n "); + private static readonly Option PromPgwOption = new( + "--prom-pgw", + "prometheus push gateway") { IsRequired = true }; + + private static readonly Option ResourceYdbPath = new( + new[] { "-t", "--resource-ydb-path" }, + () => "test-resource", + "resource ydb path to create\n "); private static readonly Option WriteTimeoutOption = new( "--write-timeout", - () => 100, - "write timeout seconds"); - - private static readonly Option MinPartitionsCountOption = new( - "--min-partitions-count", - () => 6, - "minimum amount of partitions in table"); - - private static readonly Option MaxPartitionsCountOption = new( - "--max-partitions-count", () => 1000, - "maximum amount of partitions in table"); - - private static readonly Option PartitionSizeOption = new( - "--partition-size", - () => 1, - "partition size in mb"); - - private static readonly Option InitialDataCountOption = new( - new[] { "-c", "--initial-data-count" }, - () => 1000, - "amount of initially created rows"); - - private static readonly Option PromPgwOption = new( - "--prom-pgw", - "minimum amount of partitions in table") { IsRequired = true }; + "write timeout seconds"); private static readonly Option ReportPeriodOption = new( "--report-period", @@ -58,23 +38,33 @@ public static class Cli private static readonly Option ReadTimeoutOption = new( "--read-timeout", - () => 100, + () => 1000, "read timeout seconds"); private static readonly Option WriteRpsOption = new( "--write-rps", - () => 100, + () => 1000, "write RPS"); private static readonly Option TimeOption = new( "--time", - () => 140, + () => 600, "run time in seconds"); - private static readonly Option ShutdownTimeOption = new( - "--shutdown-time", - () => 30, - "time to wait before force kill workers"); + private static readonly Option MinPartitionsCountOption = new( + "--min-partitions-count", + () => 5, + "minimum amount of partitions in table"); + + private static readonly Option MaxPartitionsCountOption = new( + "--max-partitions-count", + () => 10, + "maximum amount of partitions in table"); + + private static readonly Option InitialDataCountOption = new( + new[] { "-c", "--initial-data-count" }, + () => 1000, + "amount of initially created rows"); private static readonly Command CreateCommand = new( "create", @@ -82,10 +72,9 @@ public static class Cli { EndpointArgument, DbArgument, - TableOption, + ResourceYdbPath, MinPartitionsCountOption, MaxPartitionsCountOption, - PartitionSizeOption, InitialDataCountOption, WriteTimeoutOption }; @@ -96,7 +85,7 @@ public static class Cli { EndpointArgument, DbArgument, - TableOption, + ResourceYdbPath, InitialDataCountOption, PromPgwOption, ReportPeriodOption, @@ -104,8 +93,7 @@ public static class Cli ReadTimeoutOption, WriteRpsOption, WriteTimeoutOption, - TimeOption, - ShutdownTimeOption + TimeOption }; private static readonly RootCommand RootCommand = new("SLO app") @@ -116,13 +104,13 @@ public static class Cli public static async Task Run(SloContext sloContext, string[] args) where T : IDisposable { CreateCommand.SetHandler(async createConfig => { await sloContext.Create(createConfig); }, - new CreateConfigBinder(EndpointArgument, DbArgument, TableOption, MinPartitionsCountOption, - MaxPartitionsCountOption, PartitionSizeOption, InitialDataCountOption, WriteTimeoutOption)); + new CreateConfigBinder(EndpointArgument, DbArgument, ResourceYdbPath, MinPartitionsCountOption, + MaxPartitionsCountOption, InitialDataCountOption, WriteTimeoutOption)); RunCommand.SetHandler(async runConfig => { await sloContext.Run(runConfig); }, - new RunConfigBinder(EndpointArgument, DbArgument, TableOption, PromPgwOption, ReportPeriodOption, - ReadRpsOption, ReadTimeoutOption, WriteRpsOption, WriteTimeoutOption, TimeOption, ShutdownTimeOption)); + new RunConfigBinder(EndpointArgument, DbArgument, ResourceYdbPath, PromPgwOption, ReportPeriodOption, + ReadRpsOption, ReadTimeoutOption, WriteRpsOption, WriteTimeoutOption, TimeOption)); return await RootCommand.InvokeAsync(args); } -} \ No newline at end of file +} diff --git a/slo/src/Internal/ConfigBinders.cs b/slo/src/Internal/ConfigBinders.cs index 9a668e14..6733ff8d 100644 --- a/slo/src/Internal/ConfigBinders.cs +++ b/slo/src/Internal/ConfigBinders.cs @@ -6,10 +6,9 @@ namespace Internal; public class CreateConfigBinder( Argument endpointArgument, Argument dbArgument, - Option tableOption, + Option resourceYdbPath, Option minPartitionsCountOption, Option maxPartitionsCountOption, - Option partitionSizeOption, Option initialDataCountOption, Option writeTimeoutOption) : BinderBase @@ -19,46 +18,26 @@ protected override CreateConfig GetBoundValue(BindingContext bindingContext) return new CreateConfig( bindingContext.ParseResult.GetValueForArgument(endpointArgument), bindingContext.ParseResult.GetValueForArgument(dbArgument), - bindingContext.ParseResult.GetValueForOption(tableOption)!, + bindingContext.ParseResult.GetValueForOption(resourceYdbPath)!, bindingContext.ParseResult.GetValueForOption(minPartitionsCountOption), bindingContext.ParseResult.GetValueForOption(maxPartitionsCountOption), - bindingContext.ParseResult.GetValueForOption(partitionSizeOption), bindingContext.ParseResult.GetValueForOption(initialDataCountOption), bindingContext.ParseResult.GetValueForOption(writeTimeoutOption) ); } } -internal class CleanUpConfigBinder( - Argument endpointArgument, - Argument dbArgument, - Option tableOption, - Option writeTimeoutOption) - : BinderBase -{ - protected override CleanUpConfig GetBoundValue(BindingContext bindingContext) - { - return new CleanUpConfig( - bindingContext.ParseResult.GetValueForArgument(endpointArgument), - bindingContext.ParseResult.GetValueForArgument(dbArgument), - bindingContext.ParseResult.GetValueForOption(tableOption)!, - bindingContext.ParseResult.GetValueForOption(writeTimeoutOption) - ); - } -} - internal class RunConfigBinder( Argument endpointArgument, Argument dbArgument, - Option tableOption, + Option resourceYdbPath, Option promPgwOption, Option reportPeriodOption, Option readRpsOption, Option readTimeoutOption, Option writeRpsOption, Option writeTimeoutOption, - Option timeOption, - Option shutdownTimeOption) + Option timeOption) : BinderBase { protected override RunConfig GetBoundValue(BindingContext bindingContext) @@ -66,15 +45,14 @@ protected override RunConfig GetBoundValue(BindingContext bindingContext) return new RunConfig( bindingContext.ParseResult.GetValueForArgument(endpointArgument), bindingContext.ParseResult.GetValueForArgument(dbArgument), - bindingContext.ParseResult.GetValueForOption(tableOption)!, + bindingContext.ParseResult.GetValueForOption(resourceYdbPath)!, bindingContext.ParseResult.GetValueForOption(promPgwOption)!, bindingContext.ParseResult.GetValueForOption(reportPeriodOption), bindingContext.ParseResult.GetValueForOption(readRpsOption), bindingContext.ParseResult.GetValueForOption(readTimeoutOption), bindingContext.ParseResult.GetValueForOption(writeRpsOption), bindingContext.ParseResult.GetValueForOption(writeTimeoutOption), - bindingContext.ParseResult.GetValueForOption(timeOption), - bindingContext.ParseResult.GetValueForOption(shutdownTimeOption) + bindingContext.ParseResult.GetValueForOption(timeOption) ); } } \ No newline at end of file diff --git a/slo/src/Internal/Configs.cs b/slo/src/Internal/Configs.cs index 119361bd..025aefe5 100644 --- a/slo/src/Internal/Configs.cs +++ b/slo/src/Internal/Configs.cs @@ -3,27 +3,22 @@ namespace Internal; public record CreateConfig( string Endpoint, string Db, - string TableName, + string ResourcePathYdb, int MinPartitionsCount, int MaxPartitionsCount, - int PartitionSize, int InitialDataCount, - int WriteTimeout) : Config(Endpoint, Db, TableName, WriteTimeout); - -public record CleanUpConfig(string Endpoint, string Db, string TableName, int WriteTimeout) - : Config(Endpoint, Db, TableName, WriteTimeout); + int WriteTimeout) : Config(Endpoint, Db, ResourcePathYdb, WriteTimeout); public record RunConfig( string Endpoint, string Db, - string TableName, + string ResourcePathYdb, string PromPgw, int ReportPeriod, int ReadRps, int ReadTimeout, int WriteRps, int WriteTimeout, - int Time, - int ShutdownTime) : Config(Endpoint, Db, TableName, WriteTimeout); + int Time) : Config(Endpoint, Db, ResourcePathYdb, WriteTimeout); -public record Config(string Endpoint, string Db, string TableName, int WriteTimeout); \ No newline at end of file +public record Config(string Endpoint, string Db, string ResourcePathYdb, int WriteTimeout); \ No newline at end of file diff --git a/slo/src/Internal/SloContext.cs b/slo/src/Internal/SloContext.cs index 792b7f68..c9044840 100644 --- a/slo/src/Internal/SloContext.cs +++ b/slo/src/Internal/SloContext.cs @@ -11,7 +11,7 @@ public abstract class SloContext where T : IDisposable { // ReSharper disable once StaticMemberInGenericType protected static readonly ILoggerFactory Factory = - LoggerFactory.Create(builder => builder.AddConsole().SetMinimumLevel(LogLevel.Debug)); + LoggerFactory.Create(builder => builder.AddConsole().SetMinimumLevel(LogLevel.Information)); protected static readonly ILogger Logger = Factory.CreateLogger>(); @@ -26,11 +26,11 @@ public async Task Create(CreateConfig config) using var client = await CreateClient(config); for (var attempt = 0; attempt < maxCreateAttempts; attempt++) { - Logger.LogInformation("Creating table {TableName}..", config.TableName); + Logger.LogInformation("Creating table {ResourcePathYdb}..", config.ResourcePathYdb); try { var createTableSql = $""" - CREATE TABLE `{config.TableName}` ( + CREATE TABLE `{config.ResourcePathYdb}` ( hash Uint64, id Int32, payload_str Text, @@ -41,7 +41,6 @@ PRIMARY KEY (hash, id) ) WITH ( AUTO_PARTITIONING_BY_SIZE = ENABLED, AUTO_PARTITIONING_BY_LOAD = ENABLED, - AUTO_PARTITIONING_PARTITION_SIZE_MB = {config.PartitionSize}, AUTO_PARTITIONING_MIN_PARTITIONS_COUNT = {config.MinPartitionsCount}, AUTO_PARTITIONING_MAX_PARTITIONS_COUNT = {config.MaxPartitionsCount} ); @@ -50,7 +49,7 @@ PRIMARY KEY (hash, id) await Create(client, createTableSql, config.WriteTimeout); - Logger.LogInformation("Created table {TableName}", config.TableName); + Logger.LogInformation("Created table {ResourcePathYdb}", config.ResourcePathYdb); break; } @@ -91,12 +90,15 @@ PRIMARY KEY (hash, id) public async Task Run(RunConfig runConfig) { + // Trace.Listeners.Add(new ConsoleTraceListener()); debug meterPusher + var promPgwEndpoint = $"{runConfig.PromPgw}/metrics"; var client = await CreateClient(runConfig); - using var prometheus = new MetricPusher(promPgwEndpoint, Job, intervalMilliseconds: runConfig.ReportPeriod); + using var prometheus = new MetricPusher(promPgwEndpoint, "workload-" + Job, + intervalMilliseconds: runConfig.ReportPeriod); prometheus.Start(); - var (_, _, maxId) = await Select(client, $"SELECT MAX(id) as max_id FROM `{runConfig.TableName}`;", + var (_, _, maxId) = await Select(client, $"SELECT MAX(id) as max_id FROM `{runConfig.ResourcePathYdb}`;", new Dictionary(), runConfig.ReadTimeout); _maxId = (int)maxId!; @@ -129,36 +131,79 @@ public async Task Run(RunConfig runConfig) } await prometheus.StopAsync(); - await MetricReset(promPgwEndpoint); + // await MetricReset(promPgwEndpoint); Logger.LogInformation("Run task is finished"); return; - Task ShootingTask(RateLimiter rateLimitPolicy, string jobName, - Func> action) + Task ShootingTask(RateLimiter rateLimitPolicy, string operationType, + Func> action) { var metricFactory = Metrics.WithLabels(new Dictionary - { { "jobName", jobName }, { "sdk", "dotnet" }, { "sdkVersion", Environment.Version.ToString() } }); - - var okGauge = metricFactory.CreateGauge("oks", "Count of OK"); - var notOkGauge = metricFactory.CreateGauge("not_oks", "Count of not OK"); - var latencySummary = metricFactory.CreateSummary("latency", "Latencies (OK)", new[] { "status" }, - new SummaryConfiguration { - MaxAge = TimeSpan.FromSeconds(15), Objectives = new QuantileEpsilonPair[] - { new(0.5, 0.05), new(0.99, 0.005), new(0.999, 0.0005) } - }); + { "operation_type", operationType }, + { "sdk", "dotnet" }, + { "sdk_version", Environment.Version.ToString() }, + { "workload", Job }, + { "workload_version", "0.0.0" } + } + ); + + var operationsTotal = metricFactory.CreateCounter( + "sdk_operations_total", + "Total number of operations performed by the SDK, categorized by type." + ); + + var operationsSuccessTotal = metricFactory.CreateCounter( + "sdk_operations_success_total", + "Total number of successful operations, categorized by type." + ); + + var operationsFailureTotal = metricFactory.CreateCounter( + "sdk_operations_failure_total", + "Total number of failed operations, categorized by type." + ); + + var operationLatencySeconds = metricFactory.CreateHistogram( + "sdk_operation_latency_seconds", + "Latency of operations performed by the SDK in seconds, categorized by type and status.", + ["operation_status"], + new HistogramConfiguration + { + Buckets = + [ + 0.001, // 1 ms + 0.002, // 2 ms + 0.003, // 3 ms + 0.004, // 4 ms + 0.005, // 5 ms + 0.0075, // 7.5 ms + 0.010, // 10 ms + 0.020, // 20 ms + 0.050, // 50 ms + 0.100, // 100 ms + 0.200, // 200 ms + 0.500, // 500 ms + 1.000 // 1 s + ] + } + ); - var attemptsHistogram = metricFactory.CreateHistogram("attempts", "summary of amount for request", - new[] { "status" }, - new HistogramConfiguration { Buckets = Histogram.LinearBuckets(1, 1, 10) }); + var retryAttempts = metricFactory.CreateGauge( + "sdk_retry_attempts", + "Current retry attempts, categorized by operation type." + ); - var errorsGauge = metricFactory.CreateGauge("errors", "amount of errors", new[] { "class", "in" }); - foreach (var statusCode in Enum.GetValues()) - { - errorsGauge.WithLabels(statusCode.StatusName(), "retried").IncTo(0); - errorsGauge.WithLabels(statusCode.StatusName(), "finally").IncTo(0); - } + var pendingOperations = metricFactory.CreateGauge( + "sdk_pending_operations", + "Current number of pending operations, categorized by type." + ); + + var errorsTotal = metricFactory.CreateCounter( + "sdk_errors_total", + "Total number of errors encountered, categorized by error type.", + ["error_type"] + ); // ReSharper disable once MethodSupportsCancellation return Task.Run(async () => @@ -175,45 +220,50 @@ Task ShootingTask(RateLimiter rateLimitPolicy, string jobName, _ = Task.Run(async () => { + pendingOperations.Inc(); var sw = Stopwatch.StartNew(); - var (attempts, statusCode) = await action(client, runConfig, errorsGauge); + var (attempts, statusCode) = await action(client, runConfig, errorsTotal); sw.Stop(); - string label; + + retryAttempts.Set(attempts); + operationsTotal.Inc(); + pendingOperations.Dec(); if (statusCode != StatusCode.Success) { - notOkGauge.Inc(); - label = "err"; - errorsGauge.WithLabels(statusCode.StatusName(), "finally").Inc(); + errorsTotal.WithLabels(statusCode.StatusName()).Inc(); + operationsFailureTotal.Inc(); + operationLatencySeconds.WithLabels("err").Observe(sw.Elapsed.TotalSeconds); } else { - okGauge.Inc(); - label = "ok"; + operationsSuccessTotal.Inc(); + operationLatencySeconds.WithLabels("success").Observe(sw.Elapsed.TotalSeconds); } - - attemptsHistogram.WithLabels(label).Observe(attempts); - latencySummary.WithLabels(label).Observe(sw.ElapsedMilliseconds); }, cancellationTokenSource.Token); } - // ReSharper disable once MethodSupportsCancellation - await Task.Delay(TimeSpan.FromSeconds(runConfig.ShutdownTime)); - - Logger.LogInformation("{ShootingName} shooting is stopped", jobName); + Logger.LogInformation("{ShootingName} shooting is stopped", operationType); }); } } + // private async Task MetricReset(string promPgwEndpoint) + // { + // var deleteUri = $"{promPgwEndpoint}/job/workload-{Job}"; + // using var httpClient = new HttpClient(); + // await httpClient.DeleteAsync(deleteUri); + // } + // return attempt count & StatusCode operation protected abstract Task<(int, StatusCode)> Upsert(T client, string upsertSql, Dictionary parameters, - int writeTimeout, Gauge? errorsGauge = null); + int writeTimeout, Counter? errorsTotal = null); protected abstract Task<(int, StatusCode, object?)> Select(T client, string selectSql, - Dictionary parameters, int readTimeout, Gauge? errorsGauge = null); + Dictionary parameters, int readTimeout, Counter? errorsTotal = null); - private Task<(int, StatusCode)> Upsert(T client, Config config, Gauge? errorsGauge = null) + private Task<(int, StatusCode)> Upsert(T client, Config config, Counter? errorsTotal = null) { const int minSizeStr = 20; const int maxSizeStr = 40; @@ -224,7 +274,7 @@ Task ShootingTask(RateLimiter rateLimitPolicy, string jobName, DECLARE $payload_str AS Utf8; DECLARE $payload_double AS Double; DECLARE $payload_timestamp AS Timestamp; - UPSERT INTO `{config.TableName}` (id, hash, payload_str, payload_double, payload_timestamp) + UPSERT INTO `{config.ResourcePathYdb}` (id, hash, payload_str, payload_double, payload_timestamp) VALUES ($id, Digest::NumericHash($id), $payload_str, $payload_double, $payload_timestamp) """, new Dictionary { @@ -236,33 +286,26 @@ Task ShootingTask(RateLimiter rateLimitPolicy, string jobName, }, { "$payload_double", YdbValue.MakeDouble(Random.Shared.NextDouble()) }, { "$payload_timestamp", YdbValue.MakeTimestamp(DateTime.Now) } - }, config.WriteTimeout, errorsGauge); + }, config.WriteTimeout, errorsTotal); } protected abstract Task CreateClient(Config config); - private async Task<(int, StatusCode)> Select(T client, RunConfig config, Gauge? errorsGauge = null) + private async Task<(int, StatusCode)> Select(T client, RunConfig config, Counter? errorsTotal = null) { var (attempts, code, _) = await Select(client, $""" DECLARE $id AS Int32; SELECT id, payload_str, payload_double, payload_timestamp, payload_hash - FROM `{config.TableName}` WHERE id = $id AND hash = Digest::NumericHash($id) + FROM `{config.ResourcePathYdb}` WHERE id = $id AND hash = Digest::NumericHash($id) """, new Dictionary { { "$id", YdbValue.MakeInt32(Random.Shared.Next(_maxId)) } - }, config.ReadTimeout, errorsGauge); + }, config.ReadTimeout, errorsTotal); return (attempts, code); } - - private async Task MetricReset(string promPgwEndpoint) - { - var deleteUri = $"{promPgwEndpoint}/job/{Job}"; - using var httpClient = new HttpClient(); - await httpClient.DeleteAsync(deleteUri); - } } public static class StatusCodeExtension @@ -272,4 +315,4 @@ public static string StatusName(this StatusCode statusCode) var prefix = statusCode >= StatusCode.ClientTransportResourceExhausted ? "GRPC" : "YDB"; return $"{prefix}_{statusCode}"; } -} \ No newline at end of file +} diff --git a/slo/src/TableService/SloContext.cs b/slo/src/TableService/SloContext.cs index d426ace7..bcfc9d64 100644 --- a/slo/src/TableService/SloContext.cs +++ b/slo/src/TableService/SloContext.cs @@ -10,7 +10,7 @@ namespace TableService; public class SloContext : SloContext { private readonly TxControl _txControl = TxControl.BeginSerializableRW().Commit(); - protected override string Job => "workload-table-service"; + protected override string Job => "TableService"; protected override async Task Create(TableClient client, string createTableSql, int operationTimeout) { @@ -22,7 +22,7 @@ protected override async Task Create(TableClient client, string createTableSql, } protected override async Task<(int, StatusCode)> Upsert(TableClient tableClient, string upsertSql, - Dictionary parameters, int writeTimeout, Gauge? errorsGauge = null) + Dictionary parameters, int writeTimeout, Counter? errorsGauge = null) { var querySettings = new ExecuteDataQuerySettings { OperationTimeout = TimeSpan.FromSeconds(writeTimeout) }; @@ -49,7 +49,7 @@ protected override async Task Create(TableClient client, string createTableSql, } protected override async Task<(int, StatusCode, object?)> Select(TableClient tableClient, string selectSql, - Dictionary parameters, int readTimeout, Gauge? errorsGauge = null) + Dictionary parameters, int readTimeout, Counter? errorsGauge = null) { var querySettings = new ExecuteDataQuerySettings { OperationTimeout = TimeSpan.FromSeconds(readTimeout) }; @@ -81,4 +81,4 @@ protected override async Task CreateClient(Config config) { return new TableClient(await Driver.CreateInitialized(new DriverConfig(config.Endpoint, config.Db), Factory)); } -} \ No newline at end of file +}