diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1c3b8368..e9573fbd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -102,3 +102,76 @@ jobs: - name: Build run: npm run build + + performance-budget: + runs-on: ubuntu-latest + needs: + - frontend-tests + env: + PERF_BUDGET_HEADLESS: 'true' + steps: + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 + + - name: Set up Node.js + uses: actions/setup-node@0a44ba78451273a1ed8ac2fee4e347c72dfd377f + with: + node-version: '20' + cache: 'npm' + cache-dependency-path: ./frontend/package-lock.json + + - name: Install dependencies + working-directory: ./frontend + run: npm ci + + - name: Start application stack + run: | + docker compose -f docker-compose.dev.yml up -d --build + + - name: Wait for API + run: | + for i in {1..60}; do curl -sf http://localhost:8000/healthcheck && exit 0; sleep 2; done; echo "API did not become healthy in time"; exit 1 + + - name: Wait for Frontend + run: | + for i in {1..60}; do curl -sf http://localhost:3000/models/manifest.json && exit 0; sleep 2; done; echo "Frontend did not become ready in time"; exit 1 + + - name: Run performance budget checks + working-directory: ./frontend + env: + PERF_BUDGET_OUTPUT_DIR: ../test-results/perf + run: npm run perf:budget + + - name: Upload performance budget report + if: always() + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 + with: + name: perf-budget + path: test-results/perf + + - name: Shutdown stack + if: always() + run: | + docker compose -f docker-compose.dev.yml down + + observability-budgets: + runs-on: ubuntu-latest + needs: + - performance-budget + steps: + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 + + - name: Set up Python + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c + with: + python-version: '3.12' + + - name: Install dependencies + run: pip install pyyaml + + - name: Check observability budgets + env: + PROMETHEUS_URL: ${{ secrets.PROMETHEUS_URL }} + PROMETHEUS_BEARER_TOKEN: ${{ secrets.PROMETHEUS_BEARER_TOKEN
}} + TEMPO_URL: ${{ secrets.TEMPO_URL }} + TEMPO_BEARER_TOKEN: ${{ secrets.TEMPO_BEARER_TOKEN }} + run: python tools/ci/check_observability_budgets.py --config observability-budgets.yml diff --git a/docs/release-checklist.md b/docs/release-checklist.md new file mode 100644 index 00000000..6849b271 --- /dev/null +++ b/docs/release-checklist.md @@ -0,0 +1,39 @@ +# Release Checklist + +This checklist ties together continuous integration signal, Grafana alerting, and the on-call rotation so that preview deployments are gated on healthy performance and reliability metrics. + +## 1. Verify CI Observability Gates + +- Check the **performance-budget** job in GitHub Actions CI. This job runs the Playwright-based budget defined in `perf-budget.yml` and publishes a JUnit report that Grafana can ingest. If it fails, fix the regression before proceeding. +- Confirm that the **observability-budgets** job has passed. It queries Prometheus and Tempo spanmetrics using `observability-budgets.yml` and fails when P95 latency or error-rate thresholds are exceeded compared to the previous day. +- Export any new failure signatures into the on-call runbook. + +## 2. Review Grafana Dashboards + +- Open the "Configurator Experience" dashboard and confirm the panels for: + - `ci_perf_budget_value` vs `ci_perf_budget_threshold` (pushed from the Playwright budget run). + - Prometheus latency and error-rate panels that use the same queries as the CI job. +- Ensure alert rules are configured to page the on-call engineer whenever the CI metrics breach thresholds for two consecutive runs or when runtime metrics cross the defined budgets. + +## 3. Coordinate On-call Notifications + +- Tag the current on-call engineer in the release Slack channel with a summary of CI and Grafana status. +- Verify PagerDuty (or the configured paging tool) has matching alerts for the Grafana rules referenced above. +- Record the acknowledgement in the release ticket. + +## 4. 
Gate Preview Environments + +- Do not promote a preview environment until: + - All CI jobs, including `performance-budget` and `observability-budgets`, pass. + - Grafana dashboards show no active alerts for the release window. + - The on-call engineer confirms readiness. +- If any alert is firing, pause the release and create an incident in the on-call tracking tool. + +## 5. Final Release Sign-off + +- Update the release ticket with links to: + - The successful CI run. + - Grafana dashboard screenshots showing green status. + - PagerDuty acknowledgement (or equivalent) from the on-call engineer. +- Archive the Grafana dashboard snapshot for auditability. +- Communicate the release completion to stakeholders. diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 15efa441..b4a4a77d 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -43,13 +43,15 @@ "gltfpack": "0.25.0", "jest": "^29.7.0", "jest-environment-jsdom": "^29.7.0", + "js-yaml": "^4.1.0", "meshoptimizer": "0.25.0", "postcss": "^8", "tailwindcss": "^3.3.0", "ts-jest": "^29.2.5", "ts-node": "^10.9.2", "typescript": "^5", - "vitest": "^1.6.0" + "vitest": "^1.6.0", + "xmlbuilder2": "^4.0.0" } }, "node_modules/@adobe/css-tools": { @@ -13432,6 +13434,22 @@ "node": ">=12" } }, + "node_modules/xmlbuilder2": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/xmlbuilder2/-/xmlbuilder2-4.0.0.tgz", + "integrity": "sha512-zIoY033NGmbzHX1cYOGKNfeWpZyiGLzXGHNoxQ6tR/R+WqT7mqz+EDtFdPwqnhIms6vHz9BNtMS47DiGPyGfwg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@oozcitak/dom": "^2.0.1", + "@oozcitak/infra": "^2.0.1", + "@oozcitak/util": "^9.0.4", + "js-yaml": "^4.1.0" + }, + "engines": { + "node": ">=20.0" + } + }, "node_modules/xmlchars": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/xmlchars/-/xmlchars-2.2.0.tgz", diff --git a/frontend/package.json b/frontend/package.json index 345a1016..80cd010a 100644 --- a/frontend/package.json +++ 
b/frontend/package.json @@ -13,7 +13,8 @@ "assets:validate": "python ../scripts/glb_validate.py public/models/*.glb --fail-on-warning", "assets:manifest": "python ../scripts/gen_glb_manifest.py > public/models/manifest.json", "assets:all": "npm run assets:gen && npm run assets:pack && npm run assets:validate && npm run assets:manifest", - "test:manifest": "vitest run --reporter=dot" + "test:manifest": "vitest run --reporter=dot", + "perf:budget": "node ./tools/perf/run-perf-budget.js" }, "dependencies": { "@chakra-ui/icons": "^2.1.1", @@ -52,7 +53,9 @@ "ts-jest": "^29.2.5", "ts-node": "^10.9.2", "typescript": "^5", - "vitest": "^1.6.0" + "vitest": "^1.6.0", + "js-yaml": "^4.1.0", + "xmlbuilder2": "^4.0.0" }, "jest": { "setupFilesAfterEnv": [ diff --git a/frontend/tools/perf/run-perf-budget.js new file mode 100755 index 00000000..85b6ba87 --- /dev/null +++ b/frontend/tools/perf/run-perf-budget.js @@ -0,0 +1,422 @@ +#!/usr/bin/env node +const fs = require('fs'); +const path = require('path'); +const os = require('os'); +const yaml = require('js-yaml'); +const { chromium } = require('@playwright/test'); +const { create } = require('xmlbuilder2'); + +const repoRoot = path.resolve(__dirname, '..', '..', '..'); +const frontendRoot = path.resolve(__dirname, '..', '..'); +const perfBudgetPath = process.env.PERF_BUDGET_PATH || path.resolve(repoRoot, 'perf-budget.yml'); +const outputDir = process.env.PERF_BUDGET_OUTPUT_DIR || path.resolve(frontendRoot, 'test-results', 'perf'); +const runCountOverride = process.env.PERF_BUDGET_RUN_COUNT ?
Number(process.env.PERF_BUDGET_RUN_COUNT) : null; + +function ensureFileExists(filePath) { + if (!fs.existsSync(filePath)) { + throw new Error(`Performance budget file not found at ${filePath}`); + } +} + +function loadConfig(filePath) { + const raw = fs.readFileSync(filePath, 'utf8'); + const parsed = yaml.load(raw); + if (!parsed || typeof parsed !== 'object') { + throw new Error('Invalid performance budget configuration'); + } + if (!Array.isArray(parsed.pages)) { + throw new Error('Performance budget configuration requires a "pages" array'); + } + return parsed; +} + +function toBytesPerSecond(kbps) { + if (!kbps) { + return undefined; + } + return (kbps * 1024) / 8; +} + +async function applyThrottling(page, throttling) { + if (!throttling) { + return; + } + const client = await page.context().newCDPSession(page); + await client.send('Network.enable'); + if (typeof throttling.downloadThroughputKbps === 'number' || typeof throttling.uploadThroughputKbps === 'number' || typeof throttling.requestLatencyMs === 'number') { + await client.send('Network.emulateNetworkConditions', { + offline: false, + latency: throttling.requestLatencyMs ?? 0, + downloadThroughput: toBytesPerSecond(throttling.downloadThroughputKbps ?? 0), + uploadThroughput: toBytesPerSecond(throttling.uploadThroughputKbps ?? 0), + }); + } + if (typeof throttling.cpuSlowdownMultiplier === 'number') { + await client.send('Emulation.setCPUThrottlingRate', { + rate: Math.max(1, throttling.cpuSlowdownMultiplier), + }); + } +} + +async function performWait(page, waitConfig) { + const timeout = waitConfig.timeout_ms ?? 30000; + switch (waitConfig.type) { + case 'selector': + await page.waitForSelector(waitConfig.selector, { timeout }); + break; + case 'networkidle': + await page.waitForLoadState('networkidle', { timeout }); + if (waitConfig.idle_ms) { + await page.waitForTimeout(waitConfig.idle_ms); + } + break; + case 'timeout': + await page.waitForTimeout(waitConfig.ms ?? 
0); + break; + default: + throw new Error(`Unsupported wait type: ${waitConfig.type}`); + } +} + +async function performInteraction(page, interaction) { + const timeout = interaction.timeout_ms ?? 30000; + switch (interaction.type) { + case 'click': { + const waiters = []; + if (interaction.wait_for === 'navigation') { + waiters.push(page.waitForNavigation({ timeout })); + } + const clickPromise = page.click(interaction.selector, { + timeout, + button: interaction.button || 'left', + }); + waiters.push(clickPromise); + await Promise.all(waiters); + if (interaction.wait_for === 'networkidle') { + await page.waitForLoadState('networkidle', { timeout }); + } + break; + } + case 'press': { + await page.keyboard.press(interaction.key, { timeout }); + break; + } + case 'type': { + await page.fill(interaction.selector, interaction.value ?? '', { timeout }); + break; + } + default: + throw new Error(`Unsupported interaction type: ${interaction.type}`); + } +} + +async function runPageScenario(browser, pageConfig, globalConfig) { + const context = await browser.newContext({ ignoreHTTPSErrors: true }); + const page = await context.newPage(); + + await page.addInitScript(() => { + window.__perfMetrics = { + lcp: null, + cls: 0, + totalBlockingTime: 0, + navigationDuration: null, + firstPaint: null, + firstContentfulPaint: null, + }; + + try { + new PerformanceObserver((entryList) => { + const entries = entryList.getEntries(); + if (entries.length) { + const last = entries[entries.length - 1]; + window.__perfMetrics.lcp = last.startTime; + } + }).observe({ type: 'largest-contentful-paint', buffered: true }); + } catch (error) { + console.warn('LCP observer unavailable', error); + } + + try { + new PerformanceObserver((entryList) => { + for (const entry of entryList.getEntries()) { + if (!entry.hadRecentInput) { + window.__perfMetrics.cls += entry.value; + } + } + }).observe({ type: 'layout-shift', buffered: true }); + } catch (error) { + console.warn('CLS observer 
unavailable', error); + } + + try { + new PerformanceObserver((entryList) => { + for (const entry of entryList.getEntries()) { + const blocking = entry.duration - 50; + if (blocking > 0) { + window.__perfMetrics.totalBlockingTime += blocking; + } + } + }).observe({ type: 'longtask', buffered: true }); + } catch (error) { + console.warn('Long task observer unavailable', error); + } + + window.addEventListener('load', () => { + const navEntries = performance.getEntriesByType('navigation'); + if (navEntries.length) { + window.__perfMetrics.navigationDuration = navEntries[navEntries.length - 1].duration; + } + const paints = performance.getEntriesByType('paint'); + for (const entry of paints) { + if (entry.name === 'first-contentful-paint') { + window.__perfMetrics.firstContentfulPaint = entry.startTime; + } + if (entry.name === 'first-paint') { + window.__perfMetrics.firstPaint = entry.startTime; + } + } + }); + }); + + try { + await applyThrottling(page, globalConfig.throttling); + await page.goto(pageConfig.url, { waitUntil: 'load', timeout: pageConfig.timeout_ms ?? 60000 }); + + if (Array.isArray(pageConfig.waits)) { + for (const waitConfig of pageConfig.waits) { + await performWait(page, waitConfig); + } + } + + if (Array.isArray(pageConfig.interactions)) { + for (const interaction of pageConfig.interactions) { + await performInteraction(page, interaction); + } + } + + if (Array.isArray(pageConfig.post_interaction_waits)) { + for (const waitConfig of pageConfig.post_interaction_waits) { + await performWait(page, waitConfig); + } + } + + const collected = await page.evaluate(() => { + const result = {}; + const paints = performance.getEntriesByType('paint'); + const navEntries = performance.getEntriesByType('navigation'); + const fcpEntry = paints.find((entry) => entry.name === 'first-contentful-paint'); + const firstPaintEntry = paints.find((entry) => entry.name === 'first-paint'); + const navEntry = navEntries.length ? 
navEntries[navEntries.length - 1] : null; + const metrics = window.__perfMetrics || {}; + const firstPaint = metrics.firstPaint ?? firstPaintEntry?.startTime ?? null; + const fcp = metrics.firstContentfulPaint ?? fcpEntry?.startTime ?? null; + const lcp = metrics.lcp ?? null; + const navDuration = metrics.navigationDuration ?? navEntry?.duration ?? null; + let speedIndex = null; + if (lcp !== null || fcp !== null || firstPaint !== null) { + const samples = [firstPaint, fcp, lcp].filter((value) => typeof value === 'number'); + if (samples.length) { + const sum = samples.reduce((acc, value) => acc + value, 0); + speedIndex = sum / samples.length; + } + } + result['first-contentful-paint'] = typeof fcp === 'number' ? fcp : null; + result['largest-contentful-paint'] = typeof lcp === 'number' ? lcp : null; + result['cumulative-layout-shift'] = typeof metrics.cls === 'number' ? metrics.cls : null; + result['total-blocking-time'] = typeof metrics.totalBlockingTime === 'number' ? metrics.totalBlockingTime : null; + result['speed-index'] = typeof speedIndex === 'number' ? speedIndex : null; + result['navigation-duration'] = typeof navDuration === 'number' ? 
navDuration : null; + return result; + }); + + return collected; + } finally { + await context.close(); + } +} + +function percentile(values, percentileValue) { + if (!values.length) { + return null; + } + const sorted = [...values].sort((a, b) => a - b); + if (percentileValue <= 0) { + return sorted[0]; + } + if (percentileValue >= 1) { + return sorted[sorted.length - 1]; + } + const index = (sorted.length - 1) * percentileValue; + const lower = Math.floor(index); + const upper = Math.ceil(index); + if (lower === upper) { + return sorted[lower]; + } + const weight = index - lower; + return sorted[lower] * (1 - weight) + sorted[upper] * weight; +} + +function aggregate(values, aggregation) { + const filtered = values.filter((value) => typeof value === 'number' && Number.isFinite(value)); + if (!filtered.length) { + return null; + } + if (aggregation === 'mean') { + const sum = filtered.reduce((acc, value) => acc + value, 0); + return sum / filtered.length; + } + const percentileMatch = /^p(\d{1,3})$/i.exec(aggregation); + if (percentileMatch) { + const percentileNumber = Number(percentileMatch[1]); + if (percentileNumber < 0 || percentileNumber > 100) { + throw new Error(`Invalid percentile aggregation: ${aggregation}`); + } + return percentile(filtered, percentileNumber / 100); + } + throw new Error(`Unsupported aggregation method: ${aggregation}`); +} + +function buildJUnitSuite(results) { + const doc = create({ version: '1.0', encoding: 'UTF-8' }).ele('testsuites'); + const suite = doc.ele('testsuite', { name: 'Performance Budgets', tests: 0, failures: 0 }); + + let totalTests = 0; + let totalFailures = 0; + + for (const pageResult of results) { + for (const metric of pageResult.metrics) { + totalTests += 1; + const testCase = suite.ele('testcase', { + classname: `perf.${pageResult.id}`, + name: `${metric.id} (${metric.aggregation})`, + }); + const message = `measured=${metric.value?.toFixed ? 
metric.value.toFixed(2) : metric.value} threshold=${metric.threshold}`; + testCase.ele('system-out').txt(message); + if (!metric.pass) { + totalFailures += 1; + testCase.ele('failure', { message: `Threshold exceeded for ${metric.id}` }).txt(message); + } + } + } + + suite.att('tests', totalTests); + suite.att('failures', totalFailures); + + return doc.end({ prettyPrint: true }); +} + +async function pushToPushgateway(results) { + const pushgatewayUrl = process.env.PUSHGATEWAY_URL; + if (!pushgatewayUrl) { + return; + } + const job = process.env.PUSHGATEWAY_JOB || 'ci_perf_budget'; + const instance = process.env.PUSHGATEWAY_INSTANCE || os.hostname(); + const url = `${pushgatewayUrl.replace(/\/$/, '')}/metrics/job/${encodeURIComponent(job)}/instance/${encodeURIComponent(instance)}`; + + const lines = ['# TYPE ci_perf_budget_value gauge', '# TYPE ci_perf_budget_threshold gauge']; + + for (const pageResult of results) { + for (const metric of pageResult.metrics) { + if (typeof metric.value === 'number') { + lines.push( + `ci_perf_budget_value{page="${pageResult.id}",metric="${metric.id}",aggregation="${metric.aggregation}"} ${metric.value}`, + ); + } + lines.push( + `ci_perf_budget_threshold{page="${pageResult.id}",metric="${metric.id}",aggregation="${metric.aggregation}"} ${metric.threshold}`, + ); + } + } + + const response = await fetch(url, { + method: 'POST', + headers: { 'Content-Type': 'text/plain' }, + body: `${lines.join('\n')}\n`, + }); + + if (!response.ok) { + const text = await response.text(); + throw new Error(`Pushgateway responded with status ${response.status}: ${text}`); + } +} + +async function main() { + ensureFileExists(perfBudgetPath); + const config = loadConfig(perfBudgetPath); + if (!config.pages.length) { + console.log('No pages defined in performance budget. 
Nothing to run.'); + return; + } + + if (!fs.existsSync(outputDir)) { + fs.mkdirSync(outputDir, { recursive: true }); + } + + const runCount = runCountOverride || config.run_count || 1; + const browser = await chromium.launch({ headless: process.env.PERF_BUDGET_HEADLESS !== 'false' }); + const summary = []; + let overallPass = true; + + try { + for (const pageConfig of config.pages) { + console.log(`\nRunning performance budget for ${pageConfig.id}`); + const runMetrics = []; + for (let i = 0; i < runCount; i += 1) { + console.log(` Run ${i + 1}/${runCount}`); + const metrics = await runPageScenario(browser, pageConfig, config); + runMetrics.push(metrics); + } + + const metricResults = []; + for (const metricConfig of pageConfig.metrics || []) { + const values = runMetrics + .map((metrics) => metrics[metricConfig.id]) + .filter((value) => typeof value === 'number' && Number.isFinite(value)); + const aggregateValue = aggregate(values, metricConfig.aggregation); + const pass = typeof aggregateValue === 'number' && aggregateValue <= metricConfig.threshold; + if (!pass) { + overallPass = false; + } + metricResults.push({ + id: metricConfig.id, + aggregation: metricConfig.aggregation, + threshold: metricConfig.threshold, + unit: metricConfig.unit, + value: aggregateValue, + pass, + samples: values, + }); + const valueText = typeof aggregateValue === 'number' ? aggregateValue.toFixed(2) : 'n/a'; + console.log( + ` ${metricConfig.id} (${metricConfig.aggregation}): ${valueText}${metricConfig.unit ? 
` ${metricConfig.unit}` : ''} (threshold ${metricConfig.threshold})`, + ); + } + + summary.push({ + id: pageConfig.id, + metrics: metricResults, + }); + } + } finally { + await browser.close(); + } + + const junitXml = buildJUnitSuite(summary); + const junitPath = path.resolve(outputDir, 'perf-budget-junit.xml'); + fs.writeFileSync(junitPath, junitXml, 'utf8'); + console.log(`JUnit results written to ${junitPath}`); + + await pushToPushgateway(summary).catch((error) => { + console.warn(`Failed to push metrics to Pushgateway: ${error.message}`); + }); + + if (!overallPass) { + throw new Error('Performance budget thresholds were exceeded. See logs for details.'); + } +} + +main().catch((error) => { + console.error(error); + process.exitCode = 1; +}); diff --git a/mkdocs.yml index f60b23be..71ebe479 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -39,6 +39,7 @@ nav: - AI Engine: AI_ENGINE.md - GLB Asset Standard: GLB_ASSET_STANDARD.md - Template UI: template-preview.md + - Release Checklist: release-checklist.md plugins: - search - include-markdown diff --git a/observability-budgets.yml new file mode 100644 index 00000000..9cad6072 --- /dev/null +++ b/observability-budgets.yml @@ -0,0 +1,30 @@ +window: 15m +baseline: + offset: 24h + allowed_regression_pct: 20 +prometheus: + base_url_env: PROMETHEUS_URL + auth_token_env: PROMETHEUS_BEARER_TOKEN + metrics: + - id: api_p95_latency + description: "P95 API latency across all HTTP handlers" + query: "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service=\"paform-api\",route!~\"/healthcheck\"}[5m])) by (le))" + unit: ms + threshold: 3000 + scale: 1000 + - id: api_error_rate + description: "5xx error rate for the API" + query: "sum(rate(http_requests_total{service=\"paform-api\",status=~\"5..\"}[5m])) / sum(rate(http_requests_total{service=\"paform-api\"}[5m]))" + unit: ratio + threshold: 0.02 + max_increase_pct: 10 +tempo: + base_url_env: TEMPO_URL + auth_token_env:
TEMPO_BEARER_TOKEN + metrics: + - id: configurator_trace_p95 + description: "Configurator P95 span latency from Tempo spanmetrics" + query: "histogram_quantile(0.95, sum(rate(tempo_spanmetrics_latency_bucket{service=\"paform-frontend\",route=\"/configurator\"}[5m])) by (le))" + unit: ms + threshold: 4000 + scale: 1000 diff --git a/perf-budget.yml index 84d55db4..22050c33 100644 --- a/perf-budget.yml +++ b/perf-budget.yml @@ -1,10 +1,70 @@ -targetRoute: "/configurator" -selectorWhenReady: "canvas" -p90BudgetMs: 3500 -warmupRuns: 0 -sampleRuns: 3 -trace: - - type: click - selector: "button:has-text(\"Generate Quote\")" - - type: waitForSelector - selector: "text=Total:" +version: 1 +run_count: 3 +throttling: + profile: slow-4g + cpuSlowdownMultiplier: 4 + downloadThroughputKbps: 1500 + uploadThroughputKbps: 750 + requestLatencyMs: 40 +pages: + - id: configurator + url: http://localhost:3000/configurator + waits: + - type: selector + selector: "canvas" + timeout_ms: 60000 + - type: networkidle + idle_ms: 5000 + timeout_ms: 60000 + selectors: + viewer_canvas: "canvas" + generate_button: "button:has-text(\"Generate Quote\")" + price_total: "text=Total:" + metrics: + - id: first-contentful-paint + aggregation: p75 + threshold: 2000 + unit: ms + - id: largest-contentful-paint + aggregation: p75 + threshold: 3500 + unit: ms + - id: speed-index + aggregation: p75 + threshold: 3200 + unit: ms + - id: total-blocking-time + aggregation: p75 + threshold: 200 + unit: ms + - id: cumulative-layout-shift + aggregation: p75 + threshold: 0.1 + unit: score + - id: homepage-to-configurator + url: http://localhost:3000/ + waits: + - type: selector + selector: "a[href='/configurator']" + timeout_ms: 60000 + interactions: + - type: click + selector: "a[href='/configurator']" + wait_for: networkidle + timeout_ms: 60000 + post_interaction_waits: + - type: selector + selector: "canvas" + timeout_ms: 60000 + - type: networkidle + idle_ms: 5000 + timeout_ms: 60000 + metrics: +
- id: navigation-duration + aggregation: p90 + threshold: 3000 + unit: ms + - id: largest-contentful-paint + aggregation: p95 + threshold: 4000 + unit: ms diff --git a/tools/ci/check_observability_budgets.py b/tools/ci/check_observability_budgets.py new file mode 100755 index 00000000..324ef913 --- /dev/null +++ b/tools/ci/check_observability_budgets.py @@ -0,0 +1,238 @@ +#!/usr/bin/env python3 +"""Check Prometheus and Tempo observability budgets for regressions.""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +import time +from dataclasses import dataclass +from typing import Dict, Iterable, List, Optional +from urllib import parse, request + +import yaml + + +@dataclass +class MetricResult: + provider: str + metric_id: str + description: str + unit: str + threshold: float + value: Optional[float] + baseline: Optional[float] + passed_threshold: bool + passed_regression: bool + + +def parse_duration(duration: str) -> float: + units = { + 's': 1, + 'm': 60, + 'h': 3600, + 'd': 86400, + } + duration = duration.strip() + if duration.isdigit(): + return float(duration) + suffix = duration[-1] + if suffix not in units: + raise ValueError(f"Unsupported duration format: {duration}") + value = float(duration[:-1]) + return value * units[suffix] + + +def load_config(path: str) -> Dict: + with open(path, 'r', encoding='utf8') as handle: + return yaml.safe_load(handle) + + +def build_query_url(base_url: str, query: str, timestamp: Optional[float]) -> str: + params = {'query': query} + if timestamp is not None: + params['time'] = f"{timestamp:.3f}" + encoded = parse.urlencode(params) + return f"{base_url.rstrip('/')}/api/v1/query?{encoded}" + + +def execute_query(base_url: str, query: str, auth_token: Optional[str], timestamp: Optional[float]) -> Optional[float]: + sample_path_env = os.getenv('OBSERVABILITY_SAMPLE_FILE') + if sample_path_env: + with open(sample_path_env, 'r', encoding='utf8') as handle: + sample_data = 
json.load(handle) + provider_data = sample_data.get(base_url) + if provider_data is None: + return None + metric_data = provider_data.get(query) + if metric_data is None: + return None + return float(metric_data.get('value')) + + url = build_query_url(base_url, query, timestamp) + headers = {'Accept': 'application/json'} + if auth_token: + headers['Authorization'] = f"Bearer {auth_token}" + req = request.Request(url, headers=headers) + try: + with request.urlopen(req, timeout=30) as resp: + payload = json.loads(resp.read().decode('utf8')) + except Exception as exc: # noqa: BLE001 - we want to show full error to CI logs + raise RuntimeError(f"Failed to query {url}: {exc}") from exc + + if payload.get('status') != 'success': + raise RuntimeError(f"Query failed for {url}: {payload}") + + results = payload.get('data', {}).get('result', []) + if not results: + return None + # Prometheus returns [timestamp, value] + value = results[0].get('value') + if not value or len(value) < 2: + return None + try: + return float(value[1]) + except (TypeError, ValueError) as exc: # noqa: PERF203 - explicit error context + raise RuntimeError(f"Unexpected value returned for query {query}: {value}") from exc + + +def evaluate_metrics( + provider_name: str, + provider_config: Dict, + window_seconds: float, + baseline_offset_seconds: float, + default_regression_pct: float, +) -> Iterable[MetricResult]: + base_url_env = provider_config.get('base_url_env') + if not base_url_env: + raise ValueError(f"Provider {provider_name} missing base_url_env configuration") + base_url = os.getenv(base_url_env, '').strip() + if not base_url: + print(f"::notice ::Skipping {provider_name} metrics because {base_url_env} is not set") + return [] + + auth_token_env = provider_config.get('auth_token_env') + auth_token = os.getenv(auth_token_env, '').strip() if auth_token_env else None + + metrics = provider_config.get('metrics', []) + now = time.time() + results: List[MetricResult] = [] + + for metric in 
metrics: + metric_id = metric.get('id') + if not metric_id: + print(f"::warning ::Skipping unnamed metric in {provider_name} configuration") + continue + query = metric.get('query') + if not query: + print(f"::warning ::Metric {metric_id} missing query; skipping") + continue + unit = metric.get('unit', '') + description = metric.get('description', '') + threshold = float(metric.get('threshold')) + scale = float(metric.get('scale', 1.0)) + regression_pct = float(metric.get('max_increase_pct', default_regression_pct)) + + current_value = execute_query(base_url, query, auth_token, timestamp=None) + baseline_value = execute_query(base_url, query, auth_token, timestamp=now - baseline_offset_seconds) + + if current_value is not None: + current_value *= scale + if baseline_value is not None: + baseline_value *= scale + + passed_threshold = current_value is not None and current_value <= threshold + passed_regression = True + if ( + baseline_value is not None + and current_value is not None + and regression_pct is not None + ): + allowed = baseline_value * (1 + regression_pct / 100) + passed_regression = current_value <= allowed + + results.append( + MetricResult( + provider=provider_name, + metric_id=metric_id, + description=description, + unit=unit, + threshold=threshold, + value=current_value, + baseline=baseline_value, + passed_threshold=passed_threshold, + passed_regression=passed_regression, + ), + ) + + return results + + +def main(argv: Optional[List[str]] = None) -> int: + parser = argparse.ArgumentParser(description='Check observability budgets against Prometheus/Tempo.') + parser.add_argument('--config', default='observability-budgets.yml', help='Path to the observability budget YAML configuration.') + args = parser.parse_args(argv) + + config = load_config(args.config) + window = config.get('window', '15m') + window_seconds = parse_duration(window) + baseline_config = config.get('baseline', {}) + baseline_offset = parse_duration(baseline_config.get('offset', 
'24h')) + regression_pct = float(baseline_config.get('allowed_regression_pct', 20)) + + providers = [] + for provider_name in ('prometheus', 'tempo'): + provider_config = config.get(provider_name) + if provider_config: + providers.append((provider_name, provider_config)) + + if not providers: + print('No observability providers configured.') + return 0 + + results: List[MetricResult] = [] + for provider_name, provider_config in providers: + provider_results = list( + evaluate_metrics(provider_name, provider_config, window_seconds, baseline_offset, regression_pct), + ) + results.extend(provider_results) + + if not results: + print('No observability metrics were evaluated.') + return 0 + + failures = [ + result + for result in results + if not (result.passed_threshold and result.passed_regression) + ] + + print('Observability budget summary:') + for result in results: + value_display = 'n/a' if result.value is None else f"{result.value:.4f}" + baseline_display = 'n/a' if result.baseline is None else f"{result.baseline:.4f}" + status_parts = [] + status_parts.append('threshold OK' if result.passed_threshold else 'threshold FAIL') + status_parts.append('regression OK' if result.passed_regression else 'regression FAIL') + status = ', '.join(status_parts) + print( + f" [{result.provider}] {result.metric_id}: value={value_display}{result.unit} " + f"baseline={baseline_display}{result.unit} threshold<={result.threshold}{result.unit} -> {status}", + ) + if result.description: + print(f" {result.description}") + + if failures: + print('\nThe following observability budgets failed:') + for failure in failures: + print(f" - {failure.provider}:{failure.metric_id} (value={failure.value}, threshold={failure.threshold})") + return 1 + + print('\nAll observability budgets are within thresholds.') + return 0 + + +if __name__ == '__main__': + sys.exit(main())