From 3db187cae2b2e73a4518396ce4cf41567499b89c Mon Sep 17 00:00:00 2001
From: Wilson Li
Date: Wed, 3 Jun 2026 09:14:19 -0700
Subject: [PATCH] fix(rca): normalize confidence score scale (no more "9000%")
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The synthesis LLM is inconsistent about the confidenceScore scale — some
completions emit a 0–1 fraction (0.9), others a 0–100 value (90 / 95.00).
The score was stored unnormalized, so every consumer that assumes a 0–1
fraction rendered nonsense for the 0–100 case: the investigation header
showed "9000%", the report card "(95.00)", etc. The low-confidence
styling (`< 0.5`) also silently never fired for 0–100 scores.

Fix:
- New `src/lib/confidence.ts` — `confidenceFraction` (coerce either scale
  to a clamped 0–1 fraction) + `confidencePercent` (0–100 int). Shared by
  web and server code.
- Normalize at the source (synthesis step) so new reports persist a 0–1
  fraction — also keeps the runner's low-confidence gate (which assumes
  0–1) correct.
- Use the helpers defensively at every display site (RcaReport header +
  the low-confidence banner/styling, InvestigationPane metadata, ChatPane
  card, markdown export, Slack notifier, runner event log) so reports
  already persisted on the wrong scale render correctly too.

Found during orchestrator browser-QA; unrelated to the orchestrator, so
shipped off main. tsc clean; full suite passes at 2409 (+ confidence
unit tests).
--- src/lib/confidence.test.ts | 35 ++++++++++++++++++++++++ src/lib/confidence.ts | 27 ++++++++++++++++++ src/server/investigation-runner.ts | 3 +- src/server/slack-notifier.ts | 3 +- src/web/components/ChatPane.tsx | 3 +- src/web/components/InvestigationPane.tsx | 3 +- src/web/components/RcaReport.tsx | 11 ++++++-- src/web/lib/formatRcaMarkdown.ts | 3 +- src/workflows/steps/synthesis.ts | 6 +++- 9 files changed, 85 insertions(+), 9 deletions(-) create mode 100644 src/lib/confidence.test.ts create mode 100644 src/lib/confidence.ts diff --git a/src/lib/confidence.test.ts b/src/lib/confidence.test.ts new file mode 100644 index 00000000..19f13d8a --- /dev/null +++ b/src/lib/confidence.test.ts @@ -0,0 +1,35 @@ +import { describe, it, expect } from "vitest"; +import { confidenceFraction, confidencePercent } from "./confidence.js"; + +describe("confidenceFraction", () => { + it("passes 0–1 fractions through", () => { + expect(confidenceFraction(0.9)).toBeCloseTo(0.9); + expect(confidenceFraction(0.42)).toBeCloseTo(0.42); + expect(confidenceFraction(1)).toBe(1); + expect(confidenceFraction(0)).toBe(0); + }); + + it("rescales 0–100 values to a fraction", () => { + expect(confidenceFraction(90)).toBeCloseTo(0.9); + expect(confidenceFraction(95)).toBeCloseTo(0.95); + expect(confidenceFraction(95.0)).toBeCloseTo(0.95); + }); + + it("clamps to [0, 1] and handles bad input", () => { + expect(confidenceFraction(9000)).toBe(1); // pathological — clamp, don't emit 90x + expect(confidenceFraction(-5)).toBe(0); + expect(confidenceFraction(null)).toBe(0); + expect(confidenceFraction(undefined)).toBe(0); + expect(confidenceFraction(NaN)).toBe(0); + }); +}); + +describe("confidencePercent", () => { + it("renders both scales as the same percentage (no more 9000%)", () => { + expect(confidencePercent(0.9)).toBe(90); + expect(confidencePercent(90)).toBe(90); // the bug: was 9000 + expect(confidencePercent(0.95)).toBe(95); + expect(confidencePercent(95)).toBe(95); + 
expect(confidencePercent(null)).toBe(0); + }); +}); diff --git a/src/lib/confidence.ts b/src/lib/confidence.ts new file mode 100644 index 00000000..80486c80 --- /dev/null +++ b/src/lib/confidence.ts @@ -0,0 +1,27 @@ +/** + * Confidence-score scale normalization. + * + * `confidenceScore` is meant to be a 0–1 fraction, but the synthesis LLM is + * inconsistent: some completions emit `0.9`, others `90` (or `95.00`) on a + * 0–100 scale. Stored unnormalized, the 0–100 values render as nonsense once a + * consumer multiplies by 100 (e.g. `90 * 100 = 9000%`). + * + * These helpers coerce either scale to a single canonical form. Normalize at + * the source so new reports store a 0–1 fraction, and use these defensively at + * display sites so reports already persisted on the wrong scale still render + * sensibly. + */ + +/** Coerce a confidence score (0–1 fraction OR 0–100 percentage) to a 0–1 + * fraction, clamped to [0, 1]. `null`/`undefined`/`NaN` → 0. A value `> 1` is + * assumed to be on the 0–100 scale and divided by 100. */ +export function confidenceFraction(raw: number | null | undefined): number { + if (raw == null || Number.isNaN(raw)) return 0; + const fraction = raw > 1 ? raw / 100 : raw; + return Math.max(0, Math.min(1, fraction)); +} + +/** Whole-number percentage (0–100) for display, from either input scale. 
*/ +export function confidencePercent(raw: number | null | undefined): number { + return Math.round(confidenceFraction(raw) * 100); +} diff --git a/src/server/investigation-runner.ts b/src/server/investigation-runner.ts index 9bb0bd11..18988005 100644 --- a/src/server/investigation-runner.ts +++ b/src/server/investigation-runner.ts @@ -14,6 +14,7 @@ import { ulid } from "ulid"; import { createLogger } from "../logger.js"; import type { Database } from "./db.js"; +import { confidencePercent } from "../lib/confidence.js"; import type { IInvestigationAgent } from "../types/agent-interfaces.js"; import type { RcaReport } from "../types/rca-types.js"; import type { ServiceConfig, InvestigationTemplate } from "../config/schema.js"; @@ -321,7 +322,7 @@ export class InvestigationRunner { total_output_tokens: totalTokens.outputTokens, total_duration_ms: totalDurationMs, }); - const confidencePct = report.confidenceScore != null ? Math.round(report.confidenceScore * 100) : null; + const confidencePct = report.confidenceScore != null ? confidencePercent(report.confidenceScore) : null; eventLog.append({ kind: "investigation_completed", severity: "success", diff --git a/src/server/slack-notifier.ts b/src/server/slack-notifier.ts index 842b4a88..be3521a1 100644 --- a/src/server/slack-notifier.ts +++ b/src/server/slack-notifier.ts @@ -5,6 +5,7 @@ import { createLogger } from "../logger.js"; import type { RcaReport } from "../types/rca-types.js"; +import { confidencePercent } from "../lib/confidence.js"; const logger = createLogger(); @@ -79,7 +80,7 @@ export async function notifySlack( const severity = report.severity ?? "unknown"; const confidence = report.confidenceScore != null - ? `${Math.round(report.confidenceScore * 100)}%` + ? `${confidencePercent(report.confidenceScore)}%` : "N/A"; const rootCause = report.rootCause ?? "Unable to determine"; const summary = report.summary ?? 
""; diff --git a/src/web/components/ChatPane.tsx b/src/web/components/ChatPane.tsx index 8b044289..384c42ba 100644 --- a/src/web/components/ChatPane.tsx +++ b/src/web/components/ChatPane.tsx @@ -6,6 +6,7 @@ import { Button } from "@/components/ui/button"; import { Search, SearchCode, MessageSquare, Plus, FileText, ChevronRight, ChevronDown, Send, Trash2, X, ArrowRight, Zap } from "lucide-react"; import { renderInline } from "../lib/renderInline"; import { renderMarkdown } from "../lib/renderMarkdown"; +import { confidenceFraction } from "../../lib/confidence.js"; import { formatTokens } from "../lib/formatTokens.js"; import { formatTimestamp } from "../lib/formatTimestamp"; import { MetricChart, type TimeSeriesData } from "./MetricChart"; @@ -729,7 +730,7 @@ export function ChatPane({ ws, onInvestigationStarted, onViewInvestigation, acti {msg.report.severity} - {msg.report.confidence}{msg.report.confidenceScore != null ? ` (${msg.report.confidenceScore.toFixed(2)})` : ""} + {msg.report.confidence}{msg.report.confidenceScore != null ? 
` (${confidenceFraction(msg.report.confidenceScore).toFixed(2)})` : ""} {msg.report.summary && ( diff --git a/src/web/components/InvestigationPane.tsx b/src/web/components/InvestigationPane.tsx index 2e099865..9bb17e26 100644 --- a/src/web/components/InvestigationPane.tsx +++ b/src/web/components/InvestigationPane.tsx @@ -23,6 +23,7 @@ import type { RcaReport as RcaReportType } from "../../types/rca-types.js"; import { formatTokens } from "../lib/formatTokens.js"; import { buildPhaseActions } from "../lib/grafana-links.js"; import { downloadMarkdown, downloadPng, copyMarkdown } from "../lib/exportInvestigation.js"; +import { confidencePercent } from "../../lib/confidence.js"; const DEFAULT_PHASES: PhaseState[] = [ { name: "planning", label: "Planning", status: "pending" }, @@ -706,7 +707,7 @@ export function InvestigationPane({ {(report as any)?.confidence && ( )} {(report as any)?.severity && ( diff --git a/src/web/components/RcaReport.tsx b/src/web/components/RcaReport.tsx index a495e147..f26352b5 100644 --- a/src/web/components/RcaReport.tsx +++ b/src/web/components/RcaReport.tsx @@ -2,6 +2,7 @@ import { Badge } from "@/components/ui/badge"; import { type ReactNode } from "react"; import { FileText } from "lucide-react"; import { renderInline } from "../lib/renderInline"; +import { confidenceFraction } from "../../lib/confidence.js"; interface RcaReportData { rootCause: string; @@ -98,6 +99,10 @@ function Section({ label, count, children }: { label: string; count?: number; ch export function RcaReport({ report, hideOldDashboardLinks }: { report: RcaReportData; hideOldDashboardLinks?: boolean }) { + // Normalize to a 0–1 fraction for display/styling — stored scores may be 0–1 + // or 0–100 depending on the synthesis completion (see lib/confidence). + const confFrac = report.confidenceScore != null ? confidenceFraction(report.confidenceScore) : null; + const severityGlow = report.severity === "critical" ? 
"glow-red border-destructive/30" : report.severity === "high" ? "glow-coral border-accent/25" : @@ -118,7 +123,7 @@ export function RcaReport({ report, hideOldDashboardLinks }: { report: RcaReport {report.severity} - {report.confidence}{report.confidenceScore != null ? ` (${report.confidenceScore.toFixed(2)})` : ""} confidence + {report.confidence}{confFrac != null ? ` (${confFrac.toFixed(2)})` : ""} confidence @@ -142,7 +147,7 @@ export function RcaReport({ report, hideOldDashboardLinks }: { report: RcaReport )} {/* Low confidence banner */} - {report.confidenceScore != null && report.confidenceScore > 0 && report.confidenceScore < 0.5 && ( + {confFrac != null && confFrac > 0 && confFrac < 0.5 && (
Low confidence — insufficient data to determine root cause @@ -155,7 +160,7 @@ export function RcaReport({ report, hideOldDashboardLinks }: { report: RcaReport
Root Cause -

{renderInline(report.rootCause)}

+

{renderInline(report.rootCause)}

diff --git a/src/web/lib/formatRcaMarkdown.ts b/src/web/lib/formatRcaMarkdown.ts index d66fa139..99ac7c01 100644 --- a/src/web/lib/formatRcaMarkdown.ts +++ b/src/web/lib/formatRcaMarkdown.ts @@ -4,13 +4,14 @@ */ import type { RcaReport } from "../../types/rca-types.js"; +import { confidencePercent } from "../../lib/confidence.js"; export function formatRcaMarkdown(report: RcaReport): string { const lines: string[] = []; lines.push(`# RCA Report: ${report.service}`); lines.push(""); - lines.push(`**Severity:** ${report.severity} | **Confidence:** ${report.confidence} (${Math.round(report.confidenceScore * 100)}%)`); + lines.push(`**Severity:** ${report.severity} | **Confidence:** ${report.confidence} (${confidencePercent(report.confidenceScore)}%)`); lines.push(`**Investigated:** ${report.investigatedAt}`); if (report.timeRange) { lines.push(`**Investigation window:** ${report.timeRange.from} → ${report.timeRange.to}`); diff --git a/src/workflows/steps/synthesis.ts b/src/workflows/steps/synthesis.ts index b1006f1e..4977eb8b 100644 --- a/src/workflows/steps/synthesis.ts +++ b/src/workflows/steps/synthesis.ts @@ -15,6 +15,7 @@ import { safeJsonParse } from "../../agents/shared/processors.js"; import { createSynthesisAgent } from "../../agents/synthesis.js"; import { wrapUntrusted } from "../../agents/shared/prompt-helpers.js"; import { formatPatterns } from "../../agents/shared/patterns.js"; +import { confidenceFraction } from "../../lib/confidence.js"; import { withLlmRetry, safeAgentRetryConfig } from "../../agents/shared/llm-retry.js"; import { LlmUnavailableError } from "../../agents/shared/llm-errors.js"; import { RankedHypothesisSchema } from "../schemas.js"; @@ -241,7 +242,10 @@ export function buildSynthesisStep(config: WorkflowConfig) { dashboardLinks = synthesisParsed.dashboardLinks ?? dashboardLinks; recommendedActions = synthesisParsed.recommendedActions ?? recommendedActions; confidence = synthesisParsed.confidence ?? 
confidence; - confidenceScore = synthesisParsed.confidenceScore ?? confidenceScore; + // Normalize to a 0–1 fraction at the source — the LLM is inconsistent + // (some completions emit 0.9, others 90), and every downstream consumer + // (display ×100, the low-confidence gate) assumes 0–1. + confidenceScore = confidenceFraction(synthesisParsed.confidenceScore ?? confidenceScore); } // Deterministic severity validation