import { expect, test } from '@playwright/test';
import type { Browser } from '@playwright/test';
import fs from 'node:fs';
import path from 'node:path';
import yaml from 'js-yaml';

/** One wait step executed after the destination page of a journey has loaded. */
type WaitConfig = {
  /** `selector` waits for an element; `networkidle` waits for the network-idle load state. */
  type: 'selector' | 'networkidle';
  /** Selector to wait for; only read when `type` is `selector`. */
  selector?: string;
  /** Timeout for the wait, in milliseconds. */
  timeout_ms?: number;
  /** Extra fixed pause after a `networkidle` wait, in milliseconds. */
  idle_ms?: number;
};

/** Budget for a single metric of a journey. */
type MetricConfig = {
  /** Metric identifier, e.g. `navigation-duration` or `largest-contentful-paint`. */
  id: string;
  /** Aggregation name such as `p90`. */
  aggregation: string;
  /** Budget the aggregated value is compared against. */
  threshold: number;
  /** Unit label for `threshold`. */
  unit: string;
};

/** A navigation from one URL to another, with optional waits and budgets. */
type JourneyConfig = {
  id: string;
  description?: string;
  from_url: string;
  to_url: string;
  waits?: WaitConfig[];
  metrics?: MetricConfig[];
};
/** Top-level shape of the perf-budget YAML file. */
type PerfBudgetConfig = {
  /** Number of times each journey is measured. */
  run_count?: number;
  journeys?: JourneyConfig[];
};

/** Aggregated value of one metric together with its budget verdict. */
type JourneyMetricSummary = {
  /** Aggregation label, e.g. `p90`. */
  quantile: string;
  value_ms: number;
  threshold_ms?: number;
  passed: boolean;
};

/** Everything recorded for one journey: the raw runs and the aggregated summaries. */
type JourneyResult = {
  id: string;
  description?: string;
  runs: Array<{
    navigation_duration_ms: number;
    largest_contentful_paint_ms: number;
  }>;
  aggregated: {
    navigation_duration?: JourneyMetricSummary;
    largest_contentful_paint?: JourneyMetricSummary;
  };
};

/**
 * Linearly interpolated percentile of `values`.
 *
 * @param values - Samples; the array is not mutated.
 * @param percentileValue - Fraction in [0, 1]; values outside the range are clamped
 *   so the computed rank can never index past either end of the sorted samples.
 * @returns The interpolated percentile, or 0 when `values` is empty.
 */
const percentile = (values: number[], percentileValue: number): number => {
  if (!values.length) {
    return 0;
  }
  const sorted = [...values].sort((a, b) => a - b);
  const fraction = Math.min(1, Math.max(0, percentileValue));
  const rank = (sorted.length - 1) * fraction;
  const lowerIndex = Math.floor(rank);
  const upperIndex = Math.ceil(rank);
  if (lowerIndex === upperIndex) {
    return sorted[lowerIndex];
  }
  const weight = rank - lowerIndex;
  return sorted[lowerIndex] + (sorted[upperIndex] - sorted[lowerIndex]) * weight;
};

/**
 * Reads and parses the perf-budget YAML file.
 *
 * The path comes from `PERF_BUDGET_FILE` (resolved against the current working
 * directory) or defaults to `perf-budget.yml` four directories above this file.
 *
 * @returns The parsed configuration; an empty object when the file has no content.
 * @throws Error when the document's top level is not a mapping.
 */
const loadPerfBudget = (): PerfBudgetConfig => {
  const configPath = process.env.PERF_BUDGET_FILE
    ? path.resolve(process.cwd(), process.env.PERF_BUDGET_FILE)
    : path.resolve(__dirname, '../../../../perf-budget.yml');

  const raw = fs.readFileSync(configPath, 'utf8');
  // NOTE(review): js-yaml v4 `load` uses the safe default schema — confirm the installed major version.
  const parsed: unknown = yaml.load(raw);
  if (parsed === undefined || parsed === null) {
    // An empty document would otherwise make every later property access throw.
    return {};
  }
  if (typeof parsed !== 'object' || Array.isArray(parsed)) {
    throw new Error(`Perf budget file ${configPath} must contain a YAML mapping at the top level`);
  }
  return parsed as PerfBudgetConfig;
};

/**
 * Converts an aggregation name such as `p95`, `P50`, `p99.9` or `p100` into a fraction.
 *
 * @returns The fraction in [0, 1], or `undefined` when the name is not a `pNN`
 *   percentile in the range 0–100.
 */
const toQuantile = (aggregation: string): number | undefined => {
  if (typeof aggregation !== 'string') {
    // YAML data is untyped at runtime; a missing aggregation must not throw here.
    return undefined;
  }
  const match = /^p(\d{1,3}(?:\.\d+)?)$/i.exec(aggregation.trim());
  if (!match) {
    return undefined;
  }
  const percentileNumber = Number(match[1]);
  if (percentileNumber > 100) {
    return undefined;
  }
  return percentileNumber / 100;
};

/**
 * Resolves the results directory and creates it if needed.
 *
 * The path comes from `PERF_RESULTS_DIR` (resolved against the current working
 * directory) or defaults to `perf-results` four directories above this file.
 */
const ensureResultsDir = (): string => {
  const dir = process.env.PERF_RESULTS_DIR
    ? path.resolve(process.cwd(), process.env.PERF_RESULTS_DIR)
    : path.resolve(__dirname, '../../../../perf-results');
  fs.mkdirSync(dir, { recursive: true });
  return dir;
};
/** Writes the journey result as pretty-printed JSON to `<dir>/<result.id>.json`. */
const writeJourneyResult = (dir: string, result: JourneyResult): void => {
  const filePath = path.join(dir, `${result.id}.json`);
  fs.writeFileSync(filePath, JSON.stringify(result, null, 2), 'utf8');
};

/** Attaches the journey result to the current Playwright test as `<result.id>-metrics.json`. */
const attachJourneyResult = async (result: JourneyResult): Promise<void> => {
  await test.info().attach(`${result.id}-metrics.json`, {
    body: Buffer.from(JSON.stringify(result, null, 2)),
    contentType: 'application/json',
  });
};

/**
 * Measures one `fromUrl` → `toUrl` navigation in a fresh browser context.
 *
 * @param browser - Playwright browser used to create the context.
 * @param fromUrl - Page loaded first, before the measured navigation.
 * @param toUrl - Destination page whose timings are reported.
 * @param waits - Optional extra waits performed after the destination has loaded.
 * @returns `navigation`: `duration` of the last navigation timing entry of the
 *   destination document; `lcp`: largest `startTime` among the observed
 *   largest-contentful-paint entries. Either is `NaN` when no entry exists.
 */
const collectNavigationMetrics = async (
  browser: Browser,
  fromUrl: string,
  toUrl: string,
  waits: WaitConfig[] | undefined,
): Promise<{ navigation: number; lcp: number }> => {
  type LcpWindow = typeof window & {
    __lcpEntries?: PerformanceEntry[];
    __lcpObserver?: PerformanceObserver;
  };

  const context = await browser.newContext();
  try {
    // The observer is registered from the init script so that it exists in the
    // destination document: state set up through page.evaluate() on the source
    // page does not survive the page.goto() navigation below.
    await context.addInitScript(() => {
      const globalWindow = window as LcpWindow;
      globalWindow.__lcpEntries = [];
      try {
        const observer = new PerformanceObserver((entryList) => {
          globalWindow.__lcpEntries = [
            ...(globalWindow.__lcpEntries ?? []),
            ...entryList.getEntries(),
          ];
        });
        observer.observe({ type: 'largest-contentful-paint', buffered: true });
        globalWindow.__lcpObserver = observer;
      } catch {
        // If the observer cannot be created, the entry list stays empty and LCP is reported as NaN.
      }
    });

    const page = await context.newPage();
    await page.goto(fromUrl, { waitUntil: 'networkidle' });
    await page.goto(toUrl, { waitUntil: 'networkidle' });

    for (const wait of waits ?? []) {
      if (wait.type === 'selector' && wait.selector) {
        await page.waitForSelector(wait.selector, {
          state: 'attached',
          timeout: wait.timeout_ms ?? 30_000,
        });
      }
      if (wait.type === 'networkidle') {
        await page.waitForLoadState('networkidle', { timeout: wait.timeout_ms ?? 30_000 });
        if (wait.idle_ms) {
          await page.waitForTimeout(wait.idle_ms);
        }
      }
    }

    // Short fixed pause before the timing entries are read.
    await page.waitForTimeout(500);

    const navigationDuration = await page.evaluate(() => {
      const entries = performance.getEntriesByType('navigation');
      const last = entries[entries.length - 1] as PerformanceNavigationTiming | undefined;
      return last ? last.duration : Number.NaN;
    });

    const lcp = await page.evaluate(() => {
      const entries = (window as LcpWindow).__lcpEntries ?? [];
      if (!entries.length) {
        return Number.NaN;
      }
      // `startTime` is always a number on a PerformanceEntry, so no fallback chain is needed.
      return entries.reduce((acc, entry) => Math.max(acc, entry.startTime), 0);
    });

    return { navigation: navigationDuration, lcp };
  } finally {
    // Close the context even when a navigation or wait throws, so failed runs do not leak it.
    await context.close();
  }
};

const perfConfig = loadPerfBudget();
// Each journey is measured three times unless the config says otherwise.
const runCount = perfConfig.run_count ?? 3;
const resultsDir = ensureResultsDir();

/**
 * Aggregates the finite samples of one metric and compares them with its budget.
 *
 * @param values - Finite samples in milliseconds.
 * @param metricConfig - Budget for the metric, if the journey configures one.
 * @param fallbackQuantile - Fraction used when no usable aggregation is configured.
 * @param fallbackLabel - Label matching `fallbackQuantile`, e.g. `p90`.
 * @returns The summary, or `undefined` when there are no samples.
 */
const summarizeMetric = (
  values: number[],
  metricConfig: MetricConfig | undefined,
  fallbackQuantile: number,
  fallbackLabel: string,
): JourneyMetricSummary | undefined => {
  if (!values.length) {
    return undefined;
  }
  const configuredQuantile = metricConfig ? toQuantile(metricConfig.aggregation) : undefined;
  // The label always names the quantile that was actually computed: when the
  // configured aggregation cannot be parsed, both value and label use the fallback.
  const label =
    metricConfig && configuredQuantile !== undefined
      ? metricConfig.aggregation.trim().toLowerCase()
      : fallbackLabel;
  const value = percentile(values, configuredQuantile ?? fallbackQuantile);
  // NOTE(review): `metricConfig.unit` is never read; the threshold is treated as milliseconds.
  const threshold = typeof metricConfig?.threshold === 'number' ? metricConfig.threshold : undefined;
  return {
    quantile: label,
    value_ms: value,
    threshold_ms: threshold,
    passed: threshold === undefined || value <= threshold,
  };
};

// One Playwright test per configured journey. Each test measures the journey
// `runCount` times, records the result as a JSON file and a test attachment,
// and then asserts the configured budgets.
test.describe('performance budget journeys', () => {
  if (!perfConfig.journeys || perfConfig.journeys.length === 0) {
    test('no journeys defined', async () => {
      test.skip(true, 'No journeys defined in perf budget');
    });
    return;
  }

  for (const journey of perfConfig.journeys) {
    test(journey.id, async ({ browser }) => {
      const runs: JourneyResult['runs'] = [];

      for (let iteration = 0; iteration < runCount; iteration += 1) {
        const metrics = await collectNavigationMetrics(browser, journey.from_url, journey.to_url, journey.waits);
        runs.push({
          navigation_duration_ms: metrics.navigation,
          largest_contentful_paint_ms: metrics.lcp,
        });
      }

      // NaN marks a run where the browser reported no entry; such runs are excluded.
      const navigationValues = runs.map((run) => run.navigation_duration_ms).filter((value) => Number.isFinite(value));
      const lcpValues = runs.map((run) => run.largest_contentful_paint_ms).filter((value) => Number.isFinite(value));

      const metricConfigById = new Map<string, MetricConfig>();
      for (const metric of journey.metrics ?? []) {
        metricConfigById.set(metric.id, metric);
      }

      const aggregated: JourneyResult['aggregated'] = {};
      const navigationSummary = summarizeMetric(
        navigationValues,
        metricConfigById.get('navigation-duration'),
        0.9,
        'p90',
      );
      if (navigationSummary) {
        aggregated.navigation_duration = navigationSummary;
      }
      const lcpSummary = summarizeMetric(
        lcpValues,
        metricConfigById.get('largest-contentful-paint'),
        0.95,
        'p95',
      );
      if (lcpSummary) {
        aggregated.largest_contentful_paint = lcpSummary;
      }

      const result: JourneyResult = {
        id: journey.id,
        description: journey.description,
        runs,
        aggregated,
      };

      // Persist before asserting so the numbers are available even when a budget fails.
      writeJourneyResult(resultsDir, result);
      await attachJourneyResult(result);

      // NOTE(review): a metric with a configured threshold but no finite samples is
      // not asserted at all — decide whether that should fail the journey.
      if (navigationSummary && navigationSummary.threshold_ms !== undefined) {
        expect(
          navigationSummary.value_ms,
          `${journey.id}: navigation duration ${navigationSummary.quantile.toUpperCase()} should remain under budget`,
        ).toBeLessThanOrEqual(navigationSummary.threshold_ms);
      }

      if (lcpSummary && lcpSummary.threshold_ms !== undefined) {
        expect(
          lcpSummary.value_ms,
          `${journey.id}: LCP ${lcpSummary.quantile.toUpperCase()} should remain under budget`,
        ).toBeLessThanOrEqual(lcpSummary.threshold_ms);
      }
    });
  }
});
"""Validate canary latency and error budgets using Prometheus and Tempo."""

from __future__ import annotations

import json
import math
import os
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from dataclasses import dataclass
from typing import Optional


@dataclass
class BudgetResult:
    """Hold the evaluation result for a particular budget."""

    name: str
    current: float
    # Baseline value, or None when the budget has no baseline comparison.
    previous: Optional[float]
    threshold: float
    # Suffix appended verbatim after each number in the report line.
    unit: str
    passed: bool
    # Relative change against the baseline, or None when it cannot be computed.
    regression: Optional[float] = None

    def to_line(self) -> str:
        """Render the result as a single ``[OK]``/``[FAIL]`` report line."""
        previous_part = f", previous={self.previous:.4f}{self.unit}" if self.previous is not None else ""
        regression_part = f", regression={self.regression:.2%}" if self.regression is not None else ""
        status = "OK" if self.passed else "FAIL"
        return (
            f"[{status}] {self.name}: current={self.current:.4f}{self.unit}{previous_part}"
            f" (threshold={self.threshold:.4f}{self.unit}{regression_part})"
        )


def _env(name: str, default: Optional[str] = None) -> Optional[str]:
    """Return the environment variable, treating unset and empty as ``default``."""
    value = os.getenv(name)
    return value if value not in (None, "") else default


def compute_regression_ratio(current: float, previous: float) -> Optional[float]:
    """Return the relative change of ``current`` against ``previous``.

    Returns None when the baseline is zero, NaN or infinite: no meaningful
    ratio exists, and a NaN ratio would fail every later comparison.
    """
    if not previous or not math.isfinite(previous):
        return None
    return (current - previous) / previous


def query_prometheus(base_url: str, query: str, timestamp: float) -> float:
    """Execute an instant Prometheus query and return the first result's value.

    Raises SystemExit when the request fails, the response cannot be decoded,
    the query is not successful, or the result is empty or malformed.
    """
    encoded_query = urllib.parse.urlencode({"query": query, "time": f"{timestamp:.3f}"})
    url = f"{base_url.rstrip('/')}/api/v1/query?{encoded_query}"
    try:
        with urllib.request.urlopen(url, timeout=15) as response:
            payload = json.loads(response.read().decode("utf-8"))
    except OSError as exc:  # pragma: no cover - network errors (URLError, socket timeouts)
        raise SystemExit(f"Failed to query Prometheus at {url}: {exc}")
    except ValueError as exc:  # pragma: no cover - undecodable or non-JSON body
        raise SystemExit(f"Prometheus returned an unreadable response for {url}: {exc}")

    if payload.get("status") != "success":
        raise SystemExit(f"Prometheus query failed: {payload}")

    results = payload.get("data", {}).get("result", [])
    if not results:
        raise SystemExit(f"Prometheus query returned no data for {query}")

    # An instant-vector sample is a [timestamp, "value"] pair; only the value is used.
    value = results[0].get("value")
    if not value or len(value) < 2:
        raise SystemExit(f"Prometheus result malformed: {results[0]}")

    return float(value[1])


def evaluate_budget(
    *,
    name: str,
    base_url: str,
    query: str,
    unit_scale: float,
    threshold: float,
    regression_tolerance: float,
    baseline_offset_seconds: float,
    unit_label: str,
) -> BudgetResult:
    """Evaluate one Prometheus-backed budget against a threshold and a baseline.

    The query runs at the current time and again ``baseline_offset_seconds``
    earlier. Both values are multiplied by ``unit_scale``. The budget passes
    when the current value is at most ``threshold`` and, if a usable baseline
    exists, the relative increase is at most ``regression_tolerance``.
    """
    now = time.time()
    current_value = query_prometheus(base_url, query, now) * unit_scale
    previous_value = query_prometheus(base_url, query, now - baseline_offset_seconds) * unit_scale

    regression_ratio = compute_regression_ratio(current_value, previous_value)

    within_threshold = current_value <= threshold
    within_regression = regression_ratio is None or regression_ratio <= regression_tolerance

    return BudgetResult(
        name=name,
        current=current_value,
        previous=previous_value,
        threshold=threshold,
        unit=unit_label,
        passed=within_threshold and within_regression,
        regression=regression_ratio,
    )


def query_tempo(base_url: str, query_json: str) -> dict:
    """POST ``query_json`` to ``<base_url>/api/search`` and return the decoded JSON.

    Raises SystemExit when the request fails or the response cannot be decoded.
    """
    # NOTE(review): confirm that the target Tempo version accepts a JSON POST on this endpoint.
    data = query_json.encode("utf-8")
    request = urllib.request.Request(
        f"{base_url.rstrip('/')}/api/search", data=data, headers={"Content-Type": "application/json"}
    )
    try:
        with urllib.request.urlopen(request, timeout=15) as response:
            return json.loads(response.read().decode("utf-8"))
    except OSError as exc:  # pragma: no cover - network errors (URLError, socket timeouts)
        raise SystemExit(f"Failed to query Tempo: {exc}")
    except ValueError as exc:  # pragma: no cover - undecodable or non-JSON body
        raise SystemExit(f"Tempo returned an unreadable response: {exc}")


def check_tempo_regressions(base_url: str, query: str, duration_budget_ms: float) -> BudgetResult:
    """Check that the slowest trace returned by Tempo stays within the budget.

    Raises SystemExit when no traces, or no numeric ``durationMs`` values, are returned.
    """
    payload = query_tempo(base_url, query)
    traces = payload.get("traces", [])
    if not traces:
        raise SystemExit("Tempo query returned no traces; cannot evaluate budget")

    durations_ms = [trace.get("durationMs") for trace in traces if isinstance(trace.get("durationMs"), (int, float))]
    if not durations_ms:
        raise SystemExit("Tempo query did not include trace durations")

    current = max(durations_ms)
    return BudgetResult(
        name="tempo-trace-duration",
        current=current,
        previous=None,
        threshold=duration_budget_ms,
        unit=" ms",
        passed=current <= duration_budget_ms,
    )


def main() -> int:
    """Run all configured budget checks and return the process exit code.

    Returns 0 when the Prometheus configuration is missing (validation is
    skipped) or every budget passes, and 1 when at least one budget fails.
    """
    prom_url = _env("PROMETHEUS_URL")
    latency_query = _env("PROMETHEUS_LATENCY_QUERY")
    error_query = _env("PROMETHEUS_ERROR_RATE_QUERY")

    if not prom_url or not latency_query or not error_query:
        print("[canary] Prometheus configuration missing; skipping canary budget validation.")
        return 0

    latency_budget_ms = float(_env("LATENCY_BUDGET_MS", "3000"))
    error_budget_rate = float(_env("ERROR_RATE_BUDGET", "0.02"))
    regression_tolerance = float(_env("REGRESSION_TOLERANCE", "0.15"))
    baseline_offset_seconds = float(_env("BASELINE_OFFSET_SECONDS", str(60 * 60)))
    # Factor applied to the latency query result before comparing with LATENCY_BUDGET_MS.
    latency_unit_scale = float(_env("LATENCY_UNIT_SCALE", "1000"))

    results = [
        evaluate_budget(
            name="p95-latency",
            base_url=prom_url,
            query=latency_query,
            unit_scale=latency_unit_scale,
            threshold=latency_budget_ms,
            regression_tolerance=regression_tolerance,
            baseline_offset_seconds=baseline_offset_seconds,
            unit_label=" ms",
        ),
        evaluate_budget(
            name="error-rate",
            base_url=prom_url,
            query=error_query,
            unit_scale=1.0,
            threshold=error_budget_rate,
            regression_tolerance=regression_tolerance,
            baseline_offset_seconds=baseline_offset_seconds,
            unit_label=" rate",
        ),
    ]

    # The Tempo check is optional and runs only when all three settings are present.
    tempo_url = _env("TEMPO_URL")
    tempo_query = _env("TEMPO_QUERY")
    tempo_budget_ms = _env("TEMPO_DURATION_BUDGET_MS")
    if tempo_url and tempo_query and tempo_budget_ms:
        results.append(check_tempo_regressions(tempo_url, tempo_query, float(tempo_budget_ms)))

    failures = [result for result in results if not result.passed]

    for result in results:
        print(result.to_line())

    if failures:
        print("Budget failures detected:")
        for failure in failures:
            print(f"  - {failure.name}")
        return 1

    print("All canary budgets within thresholds.")
    return 0


if __name__ == "__main__":
    sys.exit(main())
import fs from 'node:fs/promises';
import path from 'node:path';

// Directory holding the per-journey JSON results; also where the JUnit report is written.
const RESULTS_DIR = process.env.PERF_RESULTS_DIR
  ? path.resolve(process.cwd(), process.env.PERF_RESULTS_DIR)
  : path.resolve(process.cwd(), 'perf-results');

// An empty URL disables the push; a JUnit report is written instead (see main).
const PUSHGATEWAY_URL = process.env.PUSHGATEWAY_URL ?? '';
const PUSHGATEWAY_JOB = process.env.PUSHGATEWAY_JOB ?? 'ci-performance-budget';
const GIT_SHA = process.env.GITHUB_SHA ?? '';
const GIT_REF = process.env.GITHUB_REF ?? '';

/** Escapes a value for use inside a double-quoted Prometheus label value. */
const escapeLabelValue = (value) =>
  String(value).replace(/\\/g, '\\\\').replace(/"/g, '\\"').replace(/\n/g, '\\n');

/** Escapes a value for use in XML text or a double-quoted XML attribute. */
const escapeXml = (value) =>
  String(value)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&apos;');

/**
 * Reads and parses every `*.json` file in RESULTS_DIR.
 *
 * @returns The parsed payloads; an empty array when a path does not exist.
 * @throws Any other filesystem error, or a SyntaxError for malformed JSON.
 */
const readJourneyFiles = async () => {
  try {
    const entries = await fs.readdir(RESULTS_DIR);
    const jsonFiles = entries.filter((entry) => entry.endsWith('.json'));
    const payloads = [];
    for (const file of jsonFiles) {
      const content = await fs.readFile(path.join(RESULTS_DIR, file), 'utf8');
      payloads.push(JSON.parse(content));
    }
    return payloads;
  } catch (error) {
    if (typeof error === 'object' && error !== null && 'code' in error && error.code === 'ENOENT') {
      return [];
    }
    throw error;
  }
};

/**
 * Renders one journey as a Prometheus text-format body.
 *
 * The values are precomputed percentiles, so they are exposed as gauges whose
 * `quantile` label carries the aggregation name (e.g. `p90`). A `summary` type
 * is avoided because its `quantile` label has to be a number.
 *
 * @returns The body, terminated by a line feed as the text protocol requires.
 */
const formatPrometheusBody = (journey) => {
  const journeyLabel = `journey="${escapeLabelValue(journey.id)}"`;
  const lines = [];

  const navigation = journey.aggregated?.navigation_duration;
  if (navigation?.value_ms !== undefined) {
    lines.push('# TYPE ci_navigation_duration_seconds gauge');
    lines.push(
      `ci_navigation_duration_seconds{${journeyLabel},quantile="${escapeLabelValue(navigation.quantile)}"} ${navigation.value_ms / 1000}`,
    );
  }

  const lcp = journey.aggregated?.largest_contentful_paint;
  if (lcp?.value_ms !== undefined) {
    lines.push('# TYPE ci_lcp_seconds gauge');
    lines.push(`ci_lcp_seconds{${journeyLabel},quantile="${escapeLabelValue(lcp.quantile)}"} ${lcp.value_ms / 1000}`);
  }

  const metaLabels = [journeyLabel];
  if (GIT_SHA) {
    metaLabels.push(`git_sha="${escapeLabelValue(GIT_SHA)}"`);
  }
  if (GIT_REF) {
    metaLabels.push(`git_ref="${escapeLabelValue(GIT_REF)}"`);
  }

  lines.push('# TYPE ci_perf_build_info gauge');
  lines.push(`ci_perf_build_info{${metaLabels.join(',')}} 1`);

  return `${lines.join('\n')}\n`;
};

/**
 * POSTs each journey's metrics to the Pushgateway, grouped by job and journey.
 *
 * @throws Error when the Pushgateway answers with a non-2xx status.
 */
const pushToPushgateway = async (journeyMetrics) => {
  const baseUrl = PUSHGATEWAY_URL.replace(/\/$/, '');
  for (const journey of journeyMetrics) {
    const body = formatPrometheusBody(journey);
    const targetUrl = `${baseUrl}/metrics/job/${encodeURIComponent(PUSHGATEWAY_JOB)}/journey/${encodeURIComponent(journey.id)}`;
    const response = await fetch(targetUrl, {
      method: 'POST',
      headers: { 'Content-Type': 'text/plain' },
      body,
    });
    if (!response.ok) {
      const text = await response.text();
      throw new Error(`Failed to push metrics to Pushgateway (${response.status}): ${text}`);
    }
  }
};

/**
 * Renders the journeys as a JUnit XML document with one testcase per journey.
 *
 * A testcase is marked failed when any of its metrics has a threshold and did not pass.
 */
const toJUnit = (journeyMetrics) => {
  const testcases = journeyMetrics.map((journey) => {
    const name = journey.description ? `${journey.id} — ${journey.description}` : journey.id;
    const lines = [];
    if (journey.aggregated?.navigation_duration) {
      const metric = journey.aggregated.navigation_duration;
      lines.push(`navigation_${metric.quantile}=${metric.value_ms.toFixed(2)}ms`);
      if (metric.threshold_ms !== undefined) {
        lines.push(`navigation_threshold=${metric.threshold_ms}`);
      }
      lines.push(`navigation_passed=${metric.passed}`);
    }
    if (journey.aggregated?.largest_contentful_paint) {
      const metric = journey.aggregated.largest_contentful_paint;
      lines.push(`lcp_${metric.quantile}=${metric.value_ms.toFixed(2)}ms`);
      if (metric.threshold_ms !== undefined) {
        lines.push(`lcp_threshold=${metric.threshold_ms}`);
      }
      lines.push(`lcp_passed=${metric.passed}`);
    }
    const systemOut = lines.join('\n');
    const failure = Object.values(journey.aggregated ?? {}).some(
      (metric) => metric && metric.threshold_ms !== undefined && !metric.passed,
    );
    return {
      name,
      systemOut,
      failure,
      failureMessage: failure ? 'Performance budget regression detected' : undefined,
    };
  });

  const failures = testcases.filter((tc) => tc.failure).length;
  // NOTE(review): the element and attribute names below follow the common JUnit
  // XML layout and the suite name is a placeholder — confirm against the report consumer.
  const xmlParts = [
    '<?xml version="1.0" encoding="UTF-8"?>',
    `<testsuites tests="${testcases.length}" failures="${failures}">`,
    `<testsuite name="performance-budget" tests="${testcases.length}" failures="${failures}">`,
  ];

  for (const testcase of testcases) {
    xmlParts.push(`<testcase classname="performance-budget" name="${escapeXml(testcase.name)}">`);
    if (testcase.failure && testcase.failureMessage) {
      xmlParts.push(`<failure message="${escapeXml(testcase.failureMessage)}"/>`);
    }
    if (testcase.systemOut) {
      xmlParts.push(`<system-out>${escapeXml(testcase.systemOut)}</system-out>`);
    }
    xmlParts.push('</testcase>');
  }

  xmlParts.push('</testsuite>');
  xmlParts.push('</testsuites>');
  return xmlParts.join('');
};

/** Writes the JUnit report to `perf-budget.junit.xml` inside RESULTS_DIR. */
const writeJUnitReport = async (journeyMetrics) => {
  const xml = toJUnit(journeyMetrics);
  await fs.writeFile(path.join(RESULTS_DIR, 'perf-budget.junit.xml'), xml, 'utf8');
};

/**
 * Publishes the collected journey results: pushes them to the Pushgateway when
 * PUSHGATEWAY_URL is set, otherwise writes a JUnit report. Does nothing when no
 * results are found.
 */
const main = async () => {
  const journeyMetrics = await readJourneyFiles();
  if (!journeyMetrics.length) {
    console.log(`No performance results found in ${RESULTS_DIR}; skipping publish step.`);
    return;
  }

  if (PUSHGATEWAY_URL) {
    await pushToPushgateway(journeyMetrics);
    console.log(`Published ${journeyMetrics.length} journey metrics to Pushgateway.`);
  } else {
    await writeJUnitReport(journeyMetrics);
    console.log(`Wrote JUnit report for ${journeyMetrics.length} journeys to ${path.join(RESULTS_DIR, 'perf-budget.junit.xml')}.`);
  }
};

main().catch((error) => {
  console.error('[publish-perf-metrics] Failed to publish metrics:', error);
  // Set the exit code instead of exiting so pending output can still flush.
  process.exitCode = 1;
});