chore(ci): Basic SLO breach prototype

bric3 · bric3 · commit fdd30e5e6a7b · 2025-07-11T11:35:05.000+02:00
diff --git a/.gitlab/benchmarks/bp-runner.fail-on-breach.yml b/.gitlab/benchmarks/bp-runner.fail-on-breach.yml
@@ -0,0 +1,46 @@
+# An example of the measurements can be seen here:
+#  https://benchmarking.us1.prod.dog/benchmarks?benchmarkGroupPipelineId=66629462&benchmarkGroupSha=16054515e292a66c5eaf79b9ea62df6f348cd67e&page=1&ciJobDateStart=1746309551994&ciJobDateEnd=1748901551994&benchmarkId=14167634
+
+# Thresholds set based on guidance in https://datadoghq.atlassian.net/wiki/spaces/APMINT/pages/5070193198/How+to+set+up+pre-release+performance+quality+gates#How-to-choose-thresholds-for-pre-release-gates%3F
+
+experiments:
+  - name: Run SLO breach check
+    steps:
+      # Single step: runs `fail_on_breach` with the per-scenario thresholds
+      # listed under `scenarios`.
+      - name: SLO breach check
+        run: fail_on_breach
+        # https://datadoghq.atlassian.net/wiki/x/LgI1LgE#How-to-choose-a-warning-range-for-pre-release-gates%3F
+        warning_range: 10
+        # File spec
+        #   https://datadoghq.atlassian.net/wiki/x/LgI1LgE#Specification
+        # Measurements
+        #   https://benchmarking.us1.prod.dog/trends?projectId=4&branch=master&trendsTab=per_scenario
+        scenarios:
+
+          # Standard macrobenchmarks
+          # One entry per scenario; all of its thresholds are grouped under it.
+          - name: normal_operation/only-tracing
+            thresholds:
+              - agg_http_req_duration_p50 < 2.28 ms
+              - agg_http_req_duration_p99 < 7.45 ms
+          - name: normal_operation/otel-latest
+            thresholds:
+              - agg_http_req_duration_p50 < 2.28 ms
+              - agg_http_req_duration_p99 < 9.00 ms
+
+          - name: high_load/only-tracing
+            thresholds:
+              - throughput > 1400.0 op/s
+          - name: high_load/otel-latest
+            thresholds:
+              - throughput > 1400.0 op/s
+
+          # Startup macrobenchmarks
+          # NOTE(review): `(tracing|appsec|iast)` looks like a name pattern — confirm.
+          - name: "startup:petclinic:(tracing|appsec|iast):GlobalTracer"
+            thresholds:
+              - execution_time < 245 ms
+          - name: "startup:petclinic:profiling:GlobalTracer"
+            thresholds:
+              - execution_time < 368 ms
diff --git a/.gitlab/macrobenchmarks.yml b/.gitlab/macrobenchmarks.yml
@@ -1,3 +1,8 @@
+include:
+  - project: 'DataDog/benchmarking-platform-tools'
+    ref: '925e0a3e7dd628885f6fc69cdaea5c8cc9e212bc'  # 40-hex ref; presumably a pinned commit SHA
+    file: 'images/templates/gitlab/notify-slo-breaches.template.yml'
+
 .macrobenchmarks:
   stage: macrobenchmarks
   rules:
@@ -68,3 +73,34 @@ otel-latest:
     BP_BENCHMARKS_CONFIGURATION: otel-latest
     TRACER_OPTS: -javaagent:/app/otel-java-agent.jar -Ddd.env=otel-latest -Ddd.service=bp-java-petclinic
     JAVA_OPTS: -javaagent:/app/memcheck/stability-testing-memwatch.jar -Xmx128M
+
+
+# Runs bp-runner with the fail-on-breach config once the needed jobs finish.
+check-slo-breaches:
+  stage: macrobenchmarks
+  image: registry.ddbuild.io/images/benchmarking-platform-tools-ubuntu:latest
+  tags:
+    - "arch:amd64"
+  when: always
+  # Short `needs` form: artifacts of needed jobs are downloaded by default.
+  needs:
+    - baseline
+    - only-tracing
+    - otel-latest
+  script:
+    - export ARTIFACTS_DIR="$(pwd)/platform/artifacts/"
+    - bp-runner .gitlab/benchmarks/bp-runner.fail-on-breach.yml
+  artifacts:
+    name: "artifacts"
+    when: always
+    paths:
+      - platform/artifacts/
+    expire_in: 3 months
+
+notify-slo-breaches:
+  stage: macrobenchmarks
+  extends: .notify-slo-breaches  # presumably from the included template — confirm
+  when: always  # run regardless of earlier job status
+  needs: [check-slo-breaches]
+  variables:
+    CHANNEL: 'apm-release-platform'