Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 72 additions & 1 deletion tools/perf/check-canary-metrics.py
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,23 @@


def exit_success(message: str) -> None:
    """
    Report a successful outcome on standard output and stop the process.

    The message is printed first; the function then raises ``SystemExit``
    with status code 0, so it never returns to the caller.

    Parameters:
        message (str): Text to print before exiting.
    """
    print(message, file=sys.stdout)
    # Equivalent to sys.exit(0): the interpreter exits with status 0.
    raise SystemExit(0)


def exit_failure(message: str) -> None:
    """
    Report a failure on standard error and stop the process.

    The message is printed to stderr first; the function then raises
    ``SystemExit`` with status code 1, so it never returns to the caller.

    Parameters:
        message (str): Error message to emit to stderr before exiting.
    """
    error_stream = sys.stderr
    print(message, file=error_stream)
    # Equivalent to sys.exit(1): the interpreter exits with status 1.
    raise SystemExit(1)

Expand All @@ -56,6 +68,15 @@ def exit_failure(message: str) -> None:


def query_prometheus(url: str, query: str) -> Optional[float]:
"""
Query a Prometheus HTTP API for an instant vector and extract the first numeric value.

Returns:
The numeric value from the first result row as a float, or `None` if Prometheus returned no results.

Raises:
RuntimeError: If the HTTP request fails, Prometheus responds with a non-"success" status, or the response payload has an unexpected format.
"""
encoded_query = urllib.parse.urlencode({'query': query})
endpoint = f"{url.rstrip('/')}/api/v1/query?{encoded_query}"
try:
Expand All @@ -77,6 +98,16 @@ def query_prometheus(url: str, query: str) -> Optional[float]:


def render_query(template: str, build: str) -> str:
    """
    Fill in the `$BUILD` placeholder of a query template.

    Every literal occurrence of `$BUILD` is replaced; a template without
    the placeholder is returned unchanged.

    Parameters:
        template (str): Prometheus query template containing the `$BUILD` placeholder.
        build (str): Build identifier to substitute into the template.

    Returns:
        rendered (str): The query string with `$BUILD` replaced by `build`.
    """
    # Cut the template at each placeholder and stitch the pieces back
    # together with the build identifier in between.
    pieces = template.split('$BUILD')
    return build.join(pieces)


Expand Down Expand Up @@ -104,6 +135,20 @@ def evaluate_metric(
unit: str,
description: str,
) -> MetricResult:
"""
Evaluate a single canary metric against an absolute threshold and an optional regression allowance.

Parameters:
name (str): Human-readable metric identifier used in messages and the resulting MetricResult.name.
template (str): Prometheus query template; the placeholder `$BUILD` will be replaced with the build SHA.
threshold (float): Absolute budget value the current metric must be less than or equal to.
regression_pct (float): Allowed relative increase over the previous build (e.g., 0.1 for 10%); ignored when previous baseline is near zero or unavailable.
unit (str): Unit string appended to numeric values in human-readable messages (e.g., "ms", or empty).
description (str): Short textual description stored on the MetricResult.description for reporting.

Returns:
MetricResult: Populated result containing current and optional previous values, threshold and regression parameters, a pass/fail flag, and a human-readable message explaining the outcome.
"""
current_query = render_query(template, CURRENT_BUILD)
current_value = query_prometheus(PROMETHEUS_URL, current_query)
previous_value = None
Expand Down Expand Up @@ -161,6 +206,15 @@ def evaluate_metric(


def maybe_check_tempo() -> Optional[MetricResult]:
"""
Check Tempo for traces slower than the configured threshold for the canary service.

If TEMPO_URL is unset the function prints a skip message and returns None. Otherwise it queries Tempo for any trace with duration at or above TEMPO_SLOW_TRACE_THRESHOLD_MS within TEMPO_LOOKBACK_SECONDS for TEMPO_SERVICE and produces a MetricResult summarizing whether slow traces were found.

Returns:
MetricResult: a result named 'tempo_slow_traces' where `passed` is `False` if a slow trace was found and `current_value` is the slow-threshold in milliseconds (or `0.0` if none was found).
None: if TEMPO_URL is not configured.
"""
if not TEMPO_URL:
print('TEMPO_URL not provided; skipping trace regression checks.')
return None
Expand Down Expand Up @@ -237,11 +291,28 @@ def maybe_check_tempo() -> Optional[MetricResult]:


def write_junit(results_list: list[MetricResult]) -> None:
"""
Write a JUnit-format XML report summarizing the provided MetricResult entries and save it to the configured artifacts location.

Each MetricResult becomes a <testcase>; passing results include a <system-out> block with metric details, failing results include a <failure> element and the same details. The file is written to RESULT_DIR/JUNIT_FILENAME and a message with the written path is printed.

Parameters:
results_list (list[MetricResult]): List of metric results to include in the JUnit report.
"""
tests = len(results_list)
failures_count = len([result for result in results_list if not result.passed])
suite_name = 'canary-budget'

def xml_escape(value: str) -> str:
"""
Escape characters in a string so it is safe to include in XML content.

Parameters:
value (str): The raw string to escape.

Returns:
str: The input string with XML special characters replaced by their entity equivalents (`&amp;`, `&quot;`, `&apos;`, `&lt;`, `&gt;`).
"""
return (
value.replace('&', '&amp;')
.replace('"', '&quot;')
Expand Down Expand Up @@ -292,4 +363,4 @@ def xml_escape(value: str) -> str:
if failures:
exit_failure('Canary metrics exceeded budgets; see log for details.')

exit_success('Canary metrics are within budgets.')
exit_success('Canary metrics are within budgets.')
Loading