From 3db187cae2b2e73a4518396ce4cf41567499b89c Mon Sep 17 00:00:00 2001
From: Wilson Li <lwz812@gmail.com>
Date: Wed, 3 Jun 2026 09:14:19 -0700
Subject: [PATCH] fix(rca): normalize confidence score scale (no more "9000%")
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The synthesis LLM is inconsistent about the confidenceScore scale — some
completions emit a 0–1 fraction (0.9), others a 0–100 value (90 / 95.00).
It was stored unnormalized, so consumers rendered nonsense for the 0–100 case:
sites that multiply by 100 showed "9000%" (the investigation header), and
sites that print the raw score with two decimals showed "(95.00)" (the report
card). The low-confidence styling (`< 0.5`) also silently never fired for 0–100 scores.

Fix:
- New `src/lib/confidence.ts` — `confidenceFraction` (coerce either scale to a
  clamped 0–1) + `confidencePercent` (0–100 int). Shared web/server.
- Normalize at the source (synthesis step) so new reports persist a 0–1
  fraction — also keeps the runner's low-confidence gate (which assumes 0–1)
  correct.
- Use the helpers defensively at every display site (RcaReport header + the
  low-confidence banner/styling, InvestigationPane metadata, ChatPane card,
  markdown export, Slack notifier, runner event log) so reports already
  persisted on the wrong scale render correctly too.

Found during orchestrator browser-QA; unrelated to the orchestrator, so shipped
off main. tsc clean, full suite 2409 (+ confidence unit tests).
---
 src/lib/confidence.test.ts               | 35 ++++++++++++++++++++++++
 src/lib/confidence.ts                    | 27 ++++++++++++++++++
 src/server/investigation-runner.ts       |  3 +-
 src/server/slack-notifier.ts             |  3 +-
 src/web/components/ChatPane.tsx          |  3 +-
 src/web/components/InvestigationPane.tsx |  3 +-
 src/web/components/RcaReport.tsx         | 11 ++++++--
 src/web/lib/formatRcaMarkdown.ts         |  3 +-
 src/workflows/steps/synthesis.ts         |  6 +++-
 9 files changed, 85 insertions(+), 9 deletions(-)
 create mode 100644 src/lib/confidence.test.ts
 create mode 100644 src/lib/confidence.ts

diff --git a/src/lib/confidence.test.ts b/src/lib/confidence.test.ts
new file mode 100644
index 00000000..19f13d8a
--- /dev/null
+++ b/src/lib/confidence.test.ts
@@ -0,0 +1,35 @@
+import { describe, it, expect } from "vitest";
+import { confidenceFraction, confidencePercent } from "./confidence.js";
+
+describe("confidenceFraction", () => {
+  it("passes 0–1 fractions through", () => {
+    expect(confidenceFraction(0.9)).toBeCloseTo(0.9);
+    expect(confidenceFraction(0.42)).toBeCloseTo(0.42);
+    expect(confidenceFraction(1)).toBe(1); // boundary: exactly 1 is kept as a fraction (100%), not rescaled to 1%
+    expect(confidenceFraction(0)).toBe(0);
+  });
+
+  it("rescales 0–100 values to a fraction", () => {
+    expect(confidenceFraction(90)).toBeCloseTo(0.9);
+    expect(confidenceFraction(95)).toBeCloseTo(0.95);
+    expect(confidenceFraction(95.0)).toBeCloseTo(0.95); // 95.0 is the same number as 95; stands in for the "95.00" form
+  });
+
+  it("clamps to [0, 1] and handles bad input", () => {
+    expect(confidenceFraction(9000)).toBe(1); // pathological: 9000 / 100 = 90, which must clamp to 1 rather than surface as 90x
+    expect(confidenceFraction(-5)).toBe(0); // negatives clamp up to 0
+    expect(confidenceFraction(null)).toBe(0);
+    expect(confidenceFraction(undefined)).toBe(0);
+    expect(confidenceFraction(NaN)).toBe(0);
+  });
+});
+
+describe("confidencePercent", () => {
+  it("renders both scales as the same percentage (no more 9000%)", () => {
+    expect(confidencePercent(0.9)).toBe(90);
+    expect(confidencePercent(90)).toBe(90); // the bug: the old `Math.round(score * 100)` call sites rendered this as 9000
+    expect(confidencePercent(0.95)).toBe(95);
+    expect(confidencePercent(95)).toBe(95);
+    expect(confidencePercent(null)).toBe(0); // a missing score renders as 0
+  });
+});
diff --git a/src/lib/confidence.ts b/src/lib/confidence.ts
new file mode 100644
index 00000000..80486c80
--- /dev/null
+++ b/src/lib/confidence.ts
@@ -0,0 +1,27 @@
+/**
+ * Confidence-score scale normalization.
+ *
+ * `confidenceScore` is meant to be a 0–1 fraction, but the synthesis LLM is
+ * inconsistent: some completions emit `0.9`, others `90` (or `95.00`) on a
+ * 0–100 scale. Stored unnormalized, the 0–100 values render as nonsense once a
+ * consumer multiplies by 100 (e.g. `90 * 100 = 9000%`).
+ *
+ * These helpers coerce either scale to a single canonical form. Normalize at
+ * the source so new reports store a 0–1 fraction, and use these defensively at
+ * display sites so reports already persisted on the wrong scale still render
+ * sensibly.
+ */
+
+/** Coerce a confidence score (0–1 fraction OR 0–100 percentage) to a 0–1 fraction, clamped to [0, 1].
+ *  `null`/`undefined`/`NaN` → 0. Any value `> 1` is assumed to be on the 0–100 scale and divided by 100
+ *  (so `1.5` → `0.015`, and anything above 100 clamps to 1); exactly `1` stays a fraction (100%); negatives clamp to 0. */
+export function confidenceFraction(raw: number | null | undefined): number {
+  if (raw == null || Number.isNaN(raw)) return 0; // `== null` matches both null and undefined
+  const fraction = raw > 1 ? raw / 100 : raw; // strict `>`: 1 itself is not rescaled
+  return Math.max(0, Math.min(1, fraction)); // clamp into [0, 1]
+}
+
+/** Whole-number percentage (0–100) for display, from either input scale: `Math.round` of the normalized fraction × 100; `null`/`undefined`/`NaN` → 0. */
+export function confidencePercent(raw: number | null | undefined): number {
+  return Math.round(confidenceFraction(raw) * 100);
+}
diff --git a/src/server/investigation-runner.ts b/src/server/investigation-runner.ts
index 9bb0bd11..18988005 100644
--- a/src/server/investigation-runner.ts
+++ b/src/server/investigation-runner.ts
@@ -14,6 +14,7 @@
 import { ulid } from "ulid";
 import { createLogger } from "../logger.js";
 import type { Database } from "./db.js";
+import { confidencePercent } from "../lib/confidence.js";
 import type { IInvestigationAgent } from "../types/agent-interfaces.js";
 import type { RcaReport } from "../types/rca-types.js";
 import type { ServiceConfig, InvestigationTemplate } from "../config/schema.js";
@@ -321,7 +322,7 @@ export class InvestigationRunner {
         total_output_tokens: totalTokens.outputTokens,
         total_duration_ms: totalDurationMs,
       });
-      const confidencePct = report.confidenceScore != null ? Math.round(report.confidenceScore * 100) : null;
+      const confidencePct = report.confidenceScore != null ? confidencePercent(report.confidenceScore) : null;
       eventLog.append({
         kind: "investigation_completed",
         severity: "success",
diff --git a/src/server/slack-notifier.ts b/src/server/slack-notifier.ts
index 842b4a88..be3521a1 100644
--- a/src/server/slack-notifier.ts
+++ b/src/server/slack-notifier.ts
@@ -5,6 +5,7 @@
 
 import { createLogger } from "../logger.js";
 import type { RcaReport } from "../types/rca-types.js";
+import { confidencePercent } from "../lib/confidence.js";
 
 const logger = createLogger();
 
@@ -79,7 +80,7 @@ export async function notifySlack(
 
   const severity = report.severity ?? "unknown";
   const confidence = report.confidenceScore != null
-    ? `${Math.round(report.confidenceScore * 100)}%`
+    ? `${confidencePercent(report.confidenceScore)}%`
     : "N/A";
   const rootCause = report.rootCause ?? "Unable to determine";
   const summary = report.summary ?? "";
diff --git a/src/web/components/ChatPane.tsx b/src/web/components/ChatPane.tsx
index 8b044289..384c42ba 100644
--- a/src/web/components/ChatPane.tsx
+++ b/src/web/components/ChatPane.tsx
@@ -6,6 +6,7 @@ import { Button } from "@/components/ui/button";
 import { Search, SearchCode, MessageSquare, Plus, FileText, ChevronRight, ChevronDown, Send, Trash2, X, ArrowRight, Zap } from "lucide-react";
 import { renderInline } from "../lib/renderInline";
 import { renderMarkdown } from "../lib/renderMarkdown";
+import { confidenceFraction } from "../../lib/confidence.js";
 import { formatTokens } from "../lib/formatTokens.js";
 import { formatTimestamp } from "../lib/formatTimestamp";
 import { MetricChart, type TimeSeriesData } from "./MetricChart";
@@ -729,7 +730,7 @@ export function ChatPane({ ws, onInvestigationStarted, onViewInvestigation, acti
                         <Badge variant={msg.report.severity === "critical" ? "destructive" : "secondary"} className="text-[8px] uppercase tracking-[0.1em]">
                           {msg.report.severity}
                         </Badge>
-                        <span className="text-[8px] font-mono text-muted-foreground/70">{msg.report.confidence}{msg.report.confidenceScore != null ? ` (${msg.report.confidenceScore.toFixed(2)})` : ""}</span>
+                        <span className="text-[8px] font-mono text-muted-foreground/70">{msg.report.confidence}{msg.report.confidenceScore != null ? ` (${confidenceFraction(msg.report.confidenceScore).toFixed(2)})` : ""}</span>
                       </div>
                     </div>
                     {msg.report.summary && (
diff --git a/src/web/components/InvestigationPane.tsx b/src/web/components/InvestigationPane.tsx
index 2e099865..9bb17e26 100644
--- a/src/web/components/InvestigationPane.tsx
+++ b/src/web/components/InvestigationPane.tsx
@@ -23,6 +23,7 @@ import type { RcaReport as RcaReportType } from "../../types/rca-types.js";
 import { formatTokens } from "../lib/formatTokens.js";
 import { buildPhaseActions } from "../lib/grafana-links.js";
 import { downloadMarkdown, downloadPng, copyMarkdown } from "../lib/exportInvestigation.js";
+import { confidencePercent } from "../../lib/confidence.js";
 
 const DEFAULT_PHASES: PhaseState[] = [
   { name: "planning", label: "Planning", status: "pending" },
@@ -706,7 +707,7 @@ export function InvestigationPane({
                   {(report as any)?.confidence && (
                     <MetaRow
                       label="confidence"
-                      value={`${String((report as any).confidence).toUpperCase()}${(report as any).confidenceScore ? ` · ${Math.round((report as any).confidenceScore * 100)}%` : ""}`}
+                      value={`${String((report as any).confidence).toUpperCase()}${(report as any).confidenceScore ? ` · ${confidencePercent((report as any).confidenceScore)}%` : ""}`}
                     />
                   )}
                   {(report as any)?.severity && (
diff --git a/src/web/components/RcaReport.tsx b/src/web/components/RcaReport.tsx
index a495e147..f26352b5 100644
--- a/src/web/components/RcaReport.tsx
+++ b/src/web/components/RcaReport.tsx
@@ -2,6 +2,7 @@ import { Badge } from "@/components/ui/badge";
 import { type ReactNode } from "react";
 import { FileText } from "lucide-react";
 import { renderInline } from "../lib/renderInline";
+import { confidenceFraction } from "../../lib/confidence.js";
 
 interface RcaReportData {
   rootCause: string;
@@ -98,6 +99,10 @@ function Section({ label, count, children }: { label: string; count?: number; ch
 
 export function RcaReport({ report, hideOldDashboardLinks }: { report: RcaReportData; hideOldDashboardLinks?: boolean }) {
 
+  // Normalize to a 0–1 fraction for display/styling — stored scores may be 0–1
+  // or 0–100 depending on the synthesis completion (see lib/confidence).
+  const confFrac = report.confidenceScore != null ? confidenceFraction(report.confidenceScore) : null;
+
   const severityGlow =
     report.severity === "critical" ? "glow-red border-destructive/30" :
     report.severity === "high" ? "glow-coral border-accent/25" :
@@ -118,7 +123,7 @@ export function RcaReport({ report, hideOldDashboardLinks }: { report: RcaReport
               {report.severity}
             </Badge>
             <span className="text-[9px] font-mono text-muted-foreground">
-              {report.confidence}{report.confidenceScore != null ? ` (${report.confidenceScore.toFixed(2)})` : ""} confidence
+              {report.confidence}{confFrac != null ? ` (${confFrac.toFixed(2)})` : ""} confidence
             </span>
           </div>
         </div>
@@ -142,7 +147,7 @@ export function RcaReport({ report, hideOldDashboardLinks }: { report: RcaReport
         </div>
       )}
       {/* Low confidence banner */}
-      {report.confidenceScore != null && report.confidenceScore > 0 && report.confidenceScore < 0.5 && (
+      {confFrac != null && confFrac > 0 && confFrac < 0.5 && (
         <div className="px-5 py-2.5 bg-warning/8 border-b border-warning/15 flex items-center gap-2">
           <span className="text-warning text-sm">⚠</span>
           <span className="text-[11px] font-body text-warning/80">Low confidence — insufficient data to determine root cause</span>
@@ -155,7 +160,7 @@ export function RcaReport({ report, hideOldDashboardLinks }: { report: RcaReport
         <div className="space-y-4">
           <div>
             <SectionLabel color="text-primary">Root Cause</SectionLabel>
-            <p className={`text-[13px] font-body leading-relaxed ${report.confidenceScore != null && report.confidenceScore < 0.5 ? "text-foreground/50 italic" : "text-foreground/90"}`}>{renderInline(report.rootCause)}</p>
+            <p className={`text-[13px] font-body leading-relaxed ${confFrac != null && confFrac < 0.5 ? "text-foreground/50 italic" : "text-foreground/90"}`}>{renderInline(report.rootCause)}</p>
           </div>
 
           <div>
diff --git a/src/web/lib/formatRcaMarkdown.ts b/src/web/lib/formatRcaMarkdown.ts
index d66fa139..99ac7c01 100644
--- a/src/web/lib/formatRcaMarkdown.ts
+++ b/src/web/lib/formatRcaMarkdown.ts
@@ -4,13 +4,14 @@
  */
 
 import type { RcaReport } from "../../types/rca-types.js";
+import { confidencePercent } from "../../lib/confidence.js";
 
 export function formatRcaMarkdown(report: RcaReport): string {
   const lines: string[] = [];
 
   lines.push(`# RCA Report: ${report.service}`);
   lines.push("");
-  lines.push(`**Severity:** ${report.severity} | **Confidence:** ${report.confidence} (${Math.round(report.confidenceScore * 100)}%)`);
+  lines.push(`**Severity:** ${report.severity} | **Confidence:** ${report.confidence} (${confidencePercent(report.confidenceScore)}%)`);
   lines.push(`**Investigated:** ${report.investigatedAt}`);
   if (report.timeRange) {
     lines.push(`**Investigation window:** ${report.timeRange.from} → ${report.timeRange.to}`);
diff --git a/src/workflows/steps/synthesis.ts b/src/workflows/steps/synthesis.ts
index b1006f1e..4977eb8b 100644
--- a/src/workflows/steps/synthesis.ts
+++ b/src/workflows/steps/synthesis.ts
@@ -15,6 +15,7 @@ import { safeJsonParse } from "../../agents/shared/processors.js";
 import { createSynthesisAgent } from "../../agents/synthesis.js";
 import { wrapUntrusted } from "../../agents/shared/prompt-helpers.js";
 import { formatPatterns } from "../../agents/shared/patterns.js";
+import { confidenceFraction } from "../../lib/confidence.js";
 import { withLlmRetry, safeAgentRetryConfig } from "../../agents/shared/llm-retry.js";
 import { LlmUnavailableError } from "../../agents/shared/llm-errors.js";
 import { RankedHypothesisSchema } from "../schemas.js";
@@ -241,7 +242,10 @@ export function buildSynthesisStep(config: WorkflowConfig) {
         dashboardLinks = synthesisParsed.dashboardLinks ?? dashboardLinks;
         recommendedActions = synthesisParsed.recommendedActions ?? recommendedActions;
         confidence = synthesisParsed.confidence ?? confidence;
-        confidenceScore = synthesisParsed.confidenceScore ?? confidenceScore;
+        // Normalize to a 0–1 fraction at the source — the LLM is inconsistent
+        // (some completions emit 0.9, others 90), and every downstream consumer
+        // (display ×100, the low-confidence gate) assumes 0–1.
+        confidenceScore = confidenceFraction(synthesisParsed.confidenceScore ?? confidenceScore);
       }
 
       // Deterministic severity validation