diff --git a/mcpjam-inspector/client/src/components/evals/ci-suite-detail.tsx b/mcpjam-inspector/client/src/components/evals/ci-suite-detail.tsx index 1a9e49d8b..1a604919a 100644 --- a/mcpjam-inspector/client/src/components/evals/ci-suite-detail.tsx +++ b/mcpjam-inspector/client/src/components/evals/ci-suite-detail.tsx @@ -1,8 +1,8 @@ import { useEffect, useMemo, useState } from "react"; import { SuiteHeader } from "./suite-header"; -import { RunOverview } from "./run-overview"; +import { SuiteHeroStats } from "./suite-hero-stats"; +import { RunAccordionView } from "./run-accordion-view"; import { RunDetailView } from "./run-detail-view"; -import { TestCasesOverview } from "./test-cases-overview"; import { TestCaseDetailView } from "./test-case-detail-view"; import { useSuiteData, useRunDetailData } from "./use-suite-data"; import type { @@ -66,15 +66,10 @@ export function CiSuiteDetail({ : route.type === "test-detail" ? "test-detail" : "overview"; - const runsViewMode = - route.type === "suite-overview" && route.view === "test-cases" - ? "test-cases" - : "runs"; - const [showRunSummarySidebar, setShowRunSummarySidebar] = useState(false); const [runDetailSortBy, setRunDetailSortBy] = useState< "model" | "test" | "result" - >("test"); + >("result"); const { runTrendData, modelStats } = useSuiteData( suite, @@ -158,7 +153,7 @@ export function CiSuiteDetail({ navigateToCiEvalsRoute({ type: "suite-overview", suiteId: suite._id, - view: runsViewMode, + view: "test-cases", }); }; @@ -186,7 +181,7 @@ export function CiSuiteDetail({ deletingRunId={deletingRunId} showRunSummarySidebar={showRunSummarySidebar} setShowRunSummarySidebar={setShowRunSummarySidebar} - runsViewMode={runsViewMode} + runsViewMode={"test-cases"} runs={runs} allIterations={allIterations} aggregate={aggregate} @@ -214,6 +209,13 @@ export function CiSuiteDetail({ iterations={caseIterations} runs={runs} serverNames={connectedSuiteServers} + suiteName={suite.name} + onNavigateToSuite={() => { + navigateToCiEvalsRoute({ + type: "suite-overview", + suiteId: suite._id, + }); + }} onBack={() => { navigateToCiEvalsRoute({ type: "suite-overview", @@ -232,54 +234,30 @@ export function CiSuiteDetail({ ); })() ) : viewMode === "overview" ? ( -
- {runsViewMode === "runs" ? ( - { - navigateToCiEvalsRoute({ - type: "suite-overview", - suiteId: suite._id, - view: value, - }); - }} - userMap={userMap} - /> - ) : ( - { - navigateToCiEvalsRoute({ - type: "suite-overview", - suiteId: suite._id, - view: value, - }); - }} - onTestCaseClick={(testCaseId) => { - navigateToCiEvalsRoute({ - type: "test-detail", - suiteId: suite._id, - testId: testCaseId, - }); - }} - runTrendData={runTrendData} - modelStats={modelStats} - runsLoading={runsLoading} - onRunClick={handleRunClick} - /> - )} +
+ + { + navigateToCiEvalsRoute({ + type: "test-detail", + suiteId: suite._id, + testId: testCaseId, + }); + }} + userMap={userMap} + />
) : viewMode === "run-detail" && selectedRunDetails ? ( void; + onTestCaseClick?: (testCaseId: string) => void; + userMap?: Map; +} + +interface RunTestCase { + testCaseId: string; + title: string; + result: "passed" | "failed" | "pending" | "cancelled"; + duration: number; + model?: string; +} + +export function RunAccordionView({ + suite, + runs, + allIterations, + onRunClick, + onTestCaseClick, + userMap, +}: RunAccordionViewProps) { + // Sort runs by time (latest first) + const sortedRuns = useMemo( + () => + [...runs] + .filter((r) => r.isActive !== false) + .sort((a, b) => { + const aTime = a.completedAt ?? a.createdAt ?? 0; + const bTime = b.completedAt ?? b.createdAt ?? 0; + return bTime - aTime; + }), + [runs], + ); + + // Start with only the latest run expanded + const [expandedRunIds, setExpandedRunIds] = useState>(() => { + if (sortedRuns.length > 0) { + return new Set([sortedRuns[0]._id]); + } + return new Set(); + }); + + const toggleRun = (runId: string) => { + setExpandedRunIds((prev) => { + const next = new Set(prev); + if (next.has(runId)) { + next.delete(runId); + } else { + next.add(runId); + } + return next; + }); + }; + + // Pre-compute test cases for each run + const runTestCases = useMemo(() => { + const map = new Map(); + for (const run of sortedRuns) { + const runIterations = allIterations.filter( + (iter) => iter.suiteRunId === run._id, + ); + const testCases: RunTestCase[] = runIterations.map((iter) => ({ + testCaseId: iter.testCaseId ?? "", + title: + iter.testCaseSnapshot?.title || "Untitled test", + result: computeIterationResult(iter), + duration: + iter.startedAt && iter.updatedAt + ? iter.updatedAt - iter.startedAt + : 0, + model: iter.testCaseSnapshot?.model, + })); + // Sort: failed first, then passed, then pending + testCases.sort((a, b) => { + const order = { failed: 0, pending: 1, cancelled: 2, passed: 3 }; + return (order[a.result] ?? 4) - (order[b.result] ?? 4); + }); + map.set(run._id, testCases); + } + return map; + }, [sortedRuns, allIterations]); + + if (sortedRuns.length === 0) { + return ( +
+ No runs yet. Run your suite to see results here. +
+ ); + } + + const metricLabel = suite.source === "sdk" ? "Pass Rate" : "Accuracy"; + + return ( +
+ {sortedRuns.map((run, index) => { + const isExpanded = expandedRunIds.has(run._id); + const testCases = runTestCases.get(run._id) ?? []; + const passed = testCases.filter((t) => t.result === "passed").length; + const failed = testCases.filter((t) => t.result === "failed").length; + const total = passed + failed; + const passRate = total > 0 ? Math.round((passed / total) * 100) : null; + + const runResult = + run.result || + (run.status === "completed" && passRate !== null + ? passRate >= (run.passCriteria?.minimumPassRate ?? 100) + ? "passed" + : "failed" + : run.status === "cancelled" + ? "cancelled" + : "pending"); + const borderColor = getIterationBorderColor(runResult); + + const duration = + run.completedAt && run.createdAt + ? formatDuration(run.completedAt - run.createdAt) + : run.createdAt && run.status === "running" + ? formatDuration(Date.now() - run.createdAt) + : null; + + const timestamp = run.completedAt ?? run.createdAt; + const timeAgo = timestamp ? formatTimeAgo(timestamp) : null; + + const creator = run.createdBy && userMap?.get(run.createdBy); + + const showCiMetadata = + !!run.ciMetadata?.branch || + !!run.ciMetadata?.commitSha || + !!run.ciMetadata?.runUrl; + + return ( +
+ {/* Colored left border */} +
+ + {/* Run header — clickable to expand/collapse */} + + + {/* Expanded test cases */} + {isExpanded && ( +
+ {testCases.length === 0 ? ( +
+ {run.status === "running" || run.status === "pending" + ? "Tests are still running..." + : "No test results."} +
+ ) : ( +
+ {testCases.map((tc, tcIndex) => { + const resultIcon = + tc.result === "passed" + ? "text-green-500" + : tc.result === "failed" + ? "text-red-500" + : "text-muted-foreground"; + + return ( + + ); + })} +
+ )} + + {/* "View full run details" link */} + +
+ )} +
+ ); + })} +
+ ); +} + +function formatTimeAgo(timestamp: number): string { + const diff = Date.now() - timestamp; + const minutes = Math.floor(diff / 60000); + if (minutes < 1) return "just now"; + if (minutes < 60) return `${minutes}m ago`; + const hours = Math.floor(minutes / 60); + if (hours < 24) return `${hours}h ago`; + const days = Math.floor(hours / 24); + return `${days}d ago`; +} + +function formatDuration(durationMs: number): string { + if (durationMs < 1000) return `${Math.round(durationMs)}ms`; + const totalSeconds = Math.round(durationMs / 1000); + if (totalSeconds < 60) return `${totalSeconds}s`; + const minutes = Math.floor(totalSeconds / 60); + const seconds = totalSeconds % 60; + if (minutes < 60) return seconds ? `${minutes}m ${seconds}s` : `${minutes}m`; + const hours = Math.floor(minutes / 60); + const remainingMinutes = minutes % 60; + return remainingMinutes ? `${hours}h ${remainingMinutes}m` : `${hours}h`; +} diff --git a/mcpjam-inspector/client/src/components/evals/run-detail-view.tsx b/mcpjam-inspector/client/src/components/evals/run-detail-view.tsx index 76377e54c..37fe8f079 100644 --- a/mcpjam-inspector/client/src/components/evals/run-detail-view.tsx +++ b/mcpjam-inspector/client/src/components/evals/run-detail-view.tsx @@ -1,21 +1,11 @@ import { useMemo } from "react"; -import { X, Loader2, CheckCircle2, XCircle, AlertTriangle } from "lucide-react"; -import { Button } from "@/components/ui/button"; +import { Loader2, CheckCircle2, XCircle } from "lucide-react"; import { ChartContainer, ChartTooltip, ChartTooltipContent, } from "@/components/ui/chart"; -import { - BarChart, - Bar, - CartesianGrid, - PieChart, - Pie, - XAxis, - YAxis, - Label, -} from "recharts"; +import { PieChart, Pie, Label } from "recharts"; import { PassCriteriaBadge } from "./pass-criteria-badge"; import { IterationDetails } from "./iteration-details"; import { getIterationBorderColor } from "./helpers"; @@ -288,6 +278,40 @@ export function RunDetailView({ metricLabel={metricLabel} />
+ + {/* Inline model performance (only when ≥2 models) */} + {selectedRunChartData.modelData.length >= 2 && ( +
+ + By Model: + + {selectedRunChartData.modelData.map((model) => ( +
+
= 80 + ? "hsl(142.1 76.2% 36.3%)" + : model.passRate >= 50 + ? "hsl(45.4 93.4% 47.5%)" + : "hsl(0 84.2% 60.2%)", + }} + /> + {model.model} + + {model.passRate}% + + + ({model.passed}/{model.total}) + +
+ ))} +
+ )}
@@ -316,15 +340,12 @@ export function RunDetailView({ No iterations found. ) : ( - caseGroupsForSelectedRun.map((iteration, idx) => ( - onSelectIteration(iteration._id)} - /> - )) + )} @@ -356,294 +377,113 @@ export function RunDetailView({ - {/* Run Summary Sidebar */} - {showRunSummarySidebar && ( - <> -
setShowRunSummarySidebar(false)} - /> +
+ ); +} -
-
-
Run Summary
- -
+// Iteration list with section headers when sorted by result +function IterationListWithSections({ + iterations, + sortBy, + selectedIterationId, + onSelectIteration, +}: { + iterations: EvalIteration[]; + sortBy: "model" | "test" | "result"; + selectedIterationId: string | null; + onSelectIteration: (id: string) => void; +}) { + if (sortBy !== "result") { + // No sections — just render flat list + return ( + <> + {iterations.map((iteration, idx) => ( + onSelectIteration(iteration._id)} + /> + ))} + + ); + } -
- {/* Charts */} - {(selectedRunChartData.durationData.length > 0 || - selectedRunChartData.tokensData.length > 0 || - selectedRunChartData.modelData.length > 0) && ( -
- {/* Duration per Test Bar Chart */} - {selectedRunChartData.durationData.length > 0 && ( -
-
- Duration per Test -
- - - - { - if (value.length > 20) { - return value.substring(0, 17) + "..."; - } - return value; - }} - /> - `${value.toFixed(1)}s`} - /> - { - if (!active || !payload || payload.length === 0) - return null; - const data = payload[0].payload; - return ( -
-
- {data.name} -
-
- {data.durationSeconds.toFixed(2)}s -
-
- ); - }} - /> - -
-
-
- )} + // Group by result: failing first, then passing, then pending/cancelled + const failing = iterations.filter( + (i) => computeIterationResult(i) === "failed", + ); + const passing = iterations.filter( + (i) => computeIterationResult(i) === "passed", + ); + const other = iterations.filter((i) => { + const r = computeIterationResult(i); + return r !== "failed" && r !== "passed"; + }); - {/* Tokens per Test Bar Chart */} - {selectedRunChartData.tokensData.length > 0 && ( -
-
- Tokens per Test -
- - - - { - if (value.length > 20) { - return value.substring(0, 17) + "..."; - } - return value; - }} - /> - value.toLocaleString()} - /> - { - if (!active || !payload || payload.length === 0) - return null; - const data = payload[0].payload; - return ( -
-
- {data.name} -
-
- {Math.round(data.tokens).toLocaleString()}{" "} - tokens -
-
- ); - }} - /> - -
-
-
- )} + let globalIdx = 0; - {/* Per-Model Performance for this run */} - {selectedRunChartData.modelData.length > 1 && ( -
-
- Performance by model -
- - - - { - if (value.length > 15) { - return value.substring(0, 12) + "..."; - } - return value; - }} - /> - `${value}%`} - /> - { - if (!active || !payload || payload.length === 0) - return null; - const data = payload[0].payload; - return ( -
-
-
- - {data.model} - - - {data.passed} passed · {data.failed}{" "} - failed - -
-
-
- - {data.passRate}% - -
-
-
- ); - }} - /> - - - -
- )} -
- )} -
+ return ( + <> + {failing.length > 0 && ( + <> +
+ Failing ({failing.length}) +
+ {failing.map((iteration) => { + globalIdx++; + return ( + onSelectIteration(iteration._id)} + /> + ); + })} + + )} + {passing.length > 0 && ( + <> +
+ Passing ({passing.length})
+ {passing.map((iteration) => { + globalIdx++; + return ( + onSelectIteration(iteration._id)} + /> + ); + })} )} -
+ {other.length > 0 && ( + <> +
+ Pending / Cancelled ({other.length}) +
+ {other.map((iteration) => { + globalIdx++; + return ( + onSelectIteration(iteration._id)} + /> + ); + })} + + )} + ); } diff --git a/mcpjam-inspector/client/src/components/evals/run-overview.tsx b/mcpjam-inspector/client/src/components/evals/run-overview.tsx index 600da2c31..58ce62076 100644 --- a/mcpjam-inspector/client/src/components/evals/run-overview.tsx +++ b/mcpjam-inspector/client/src/components/evals/run-overview.tsx @@ -17,13 +17,6 @@ import { TooltipContent, TooltipTrigger, } from "@/components/ui/tooltip"; -import { - ChartContainer, - ChartTooltip, - ChartTooltipContent, -} from "@/components/ui/chart"; -import { Bar, BarChart, CartesianGrid, XAxis, YAxis } from "recharts"; -import { AccuracyChart } from "./accuracy-chart"; import { formatRunId, getIterationBorderColor } from "./helpers"; import { computeIterationResult } from "./pass-criteria"; import { EvalIteration, EvalSuiteRun } from "./types"; @@ -326,132 +319,8 @@ export function RunOverview({ }); }, [selectedRunIds, onDirectDeleteRun]); - const modelChartConfig = { - passRate: { - label: "Pass Rate", - color: "var(--chart-1)", - }, - }; - return ( <> - {/* Charts Side by Side */} -
- {/* Accuracy */} -
-
-
- {suite.source === "sdk" ? "Pass Rate Trend" : "Accuracy Trend"} -
-
-
- -
-
- - {/* Per-Model Performance */} -
-
-
- Performance by model -
-
-
- {modelStats.length > 1 ? ( - - - - { - if (value.length > 15) { - return value.substring(0, 12) + "..."; - } - return value; - }} - /> - `${value}%`} - /> - { - if (!active || !payload || payload.length === 0) - return null; - const data = payload[0].payload; - return ( -
-
-
- - {data.model} - - - {data.passed} passed · {data.failed} failed - -
-
-
- - {data.passRate}% - -
-
-
- ); - }} - /> - - - - ) : ( -

- No model data available. -

- )} -
-
-
- {/* Runs List */}
{selectedRunIds.size > 0 ? ( diff --git a/mcpjam-inspector/client/src/components/evals/suite-header.tsx b/mcpjam-inspector/client/src/components/evals/suite-header.tsx index 6b22e96a8..d71de8d9c 100644 --- a/mcpjam-inspector/client/src/components/evals/suite-header.tsx +++ b/mcpjam-inspector/client/src/components/evals/suite-header.tsx @@ -15,12 +15,6 @@ import { DropdownMenuSubTrigger, DropdownMenuTrigger, } from "@/components/ui/dropdown-menu"; -import { - ChartContainer, - ChartTooltip, - ChartTooltipContent, -} from "@/components/ui/chart"; -import { PieChart, Pie, Label } from "recharts"; import { BarChart3, Loader2, Plus, RotateCw, Trash2, X } from "lucide-react"; import { formatRunId } from "./helpers"; import { @@ -32,7 +26,7 @@ import { } from "./types"; import { useMutation } from "convex/react"; import { toast } from "sonner"; -import { computeIterationResult } from "./pass-criteria"; + import type { ModelDefinition } from "@/shared/types"; import { isMCPJamProvidedModel } from "@/shared/types"; import { ProviderLogo } from "@/components/chat-v2/chat-input/model/provider-logo"; @@ -189,65 +183,6 @@ export function SuiteHeader({ [suiteModels, onUpdateModels], ); - // Calculate accuracy chart data from active runs - const accuracyChartData = useMemo(() => { - if (!runs || !allIterations || runs.length === 0) { - return null; - } - - // Filter to active runs only - const activeRuns = runs.filter((run) => run.isActive !== false); - if (activeRuns.length === 0) { - return null; - } - - // Get all iterations from active runs - const activeRunIds = new Set(activeRuns.map((run) => run._id)); - const activeIterations = allIterations.filter( - (iter) => iter.suiteRunId && activeRunIds.has(iter.suiteRunId), - ); - - if (activeIterations.length === 0) { - return null; - } - - // Calculate passed/failed counts using consistent computation - // Only count completed iterations - exclude pending/cancelled - const iterationResults = activeIterations.map((iter) => - computeIterationResult(iter), - ); - const passed = iterationResults.filter((r) => r === "passed").length; - const failed = iterationResults.filter((r) => r === "failed").length; - const total = passed + failed; // Only count completed iterations for accuracy - - if (total === 0) { - return null; - } - - // Build donut chart data - const donutData = []; - if (passed > 0) { - donutData.push({ - name: "passed", - value: passed, - fill: "hsl(142.1 76.2% 36.3%)", - }); - } - if (failed > 0) { - donutData.push({ - name: "failed", - value: failed, - fill: "hsl(0 84.2% 60.2%)", - }); - } - - return { - donutData, - total, - accuracy: Math.round((passed / total) * 100), - }; - }, [runs, allIterations]); - const latestRunForMetadata = useMemo(() => { if (!runs || runs.length === 0) return null; return [...runs].sort((a, b) => { @@ -342,19 +277,22 @@ export function SuiteHeader({ return (
-

- Run {formatRunId(selectedRunDetails._id)} -

+
+
+ + / + Run +
+

+ Run {formatRunId(selectedRunDetails._id)} +

+
- {!readOnlyConfig && (isRunInProgress ? ( @@ -410,18 +348,6 @@ export function SuiteHeader({ ))} - - - Delete this test suite -
); diff --git a/mcpjam-inspector/client/src/components/evals/suite-hero-stats.tsx b/mcpjam-inspector/client/src/components/evals/suite-hero-stats.tsx new file mode 100644 index 000000000..339cf2619 --- /dev/null +++ b/mcpjam-inspector/client/src/components/evals/suite-hero-stats.tsx @@ -0,0 +1,342 @@ +import { useMemo } from "react"; +import { + ChartContainer, + ChartTooltip, + ChartTooltipContent, +} from "@/components/ui/chart"; +import { PieChart, Pie, Label } from "recharts"; +import { Area, AreaChart } from "recharts"; +import { computeIterationResult } from "./pass-criteria"; +import type { EvalIteration, EvalSuiteRun } from "./types"; + +interface SuiteHeroStatsProps { + runs: EvalSuiteRun[]; + allIterations: EvalIteration[]; + runTrendData: Array<{ + runId: string; + runIdDisplay: string; + passRate: number; + label: string; + }>; + modelStats: Array<{ + model: string; + passRate: number; + passed: number; + failed: number; + total: number; + }>; + testCaseCount: number; + isSDK: boolean; + onRunClick?: (runId: string) => void; +} + +export function SuiteHeroStats({ + runs, + allIterations, + runTrendData, + modelStats, + testCaseCount, + isSDK, + onRunClick, +}: SuiteHeroStatsProps) { + const stats = useMemo(() => { + const activeRuns = runs.filter((run) => run.isActive !== false); + if (activeRuns.length === 0) return null; + + const activeRunIds = new Set(activeRuns.map((r) => r._id)); + const activeIterations = allIterations.filter( + (iter) => iter.suiteRunId && activeRunIds.has(iter.suiteRunId), + ); + + const results = activeIterations.map((iter) => computeIterationResult(iter)); + const passed = results.filter((r) => r === "passed").length; + const failed = results.filter((r) => r === "failed").length; + const total = passed + failed; + + if (total === 0) return null; + + const accuracy = Math.round((passed / total) * 100); + + // Latest run info + const latestRun = [...activeRuns].sort((a, b) => { + const aTime = a.completedAt ?? a.createdAt ?? 0; + const bTime = b.completedAt ?? b.createdAt ?? 0; + return bTime - aTime; + })[0]; + + const latestRunTime = latestRun?.completedAt ?? latestRun?.createdAt; + const latestRunAgo = latestRunTime ? formatTimeAgo(latestRunTime) : null; + + // Latest run pass/fail + const latestRunIterations = allIterations.filter( + (iter) => iter.suiteRunId === latestRun?._id, + ); + const latestResults = latestRunIterations.map((iter) => + computeIterationResult(iter), + ); + const latestPassed = latestResults.filter((r) => r === "passed").length; + const latestTotal = latestResults.filter( + (r) => r === "passed" || r === "failed", + ).length; + + // Avg duration across runs + const completedRuns = activeRuns.filter( + (r) => r.completedAt && r.createdAt, + ); + const avgDuration = + completedRuns.length > 0 + ? completedRuns.reduce( + (sum, r) => sum + ((r.completedAt ?? 0) - (r.createdAt ?? 0)), + 0, + ) / completedRuns.length + : 0; + + return { + accuracy, + passed, + failed, + total, + runCount: activeRuns.length, + latestRunAgo, + latestPassed, + latestTotal, + avgDuration, + donutData: [ + ...(passed > 0 + ? [ + { + name: "passed", + value: passed, + fill: "hsl(142.1 76.2% 36.3%)", + }, + ] + : []), + ...(failed > 0 + ? [ + { + name: "failed", + value: failed, + fill: "hsl(0 84.2% 60.2%)", + }, + ] + : []), + ], + }; + }, [runs, allIterations]); + + if (!stats) { + return ( +
+ No completed runs yet. Run your suite to see results. +
+ ); + } + + const metricLabel = isSDK ? "Pass Rate" : "Accuracy"; + const showTrend = runTrendData.length >= 3; + const showModelComparison = modelStats.length >= 2; + + return ( +
+
+ {/* Big accuracy ring */} +
+ + + } /> + + + + +
+ + {/* Stats */} +
+
+ {stats.accuracy}% + {metricLabel} +
+
+ {testCaseCount} tests + | + {stats.runCount} runs + | + Avg {formatDuration(stats.avgDuration)} + {stats.latestRunAgo && ( + <> + | + + Latest: {stats.latestRunAgo} — {stats.latestPassed}/ + {stats.latestTotal} passed + + + )} +
+ {/* Pass/fail progress bar */} +
+
+
+
+
+ + {stats.passed} passed · {stats.failed} failed + +
+
+ + {/* Sparkline trend (only if ≥3 runs) */} + {showTrend && ( +
+
Trend
+ + { + if (chartData?.activePayload?.[0]?.payload?.runId) { + onRunClick( + chartData.activePayload[0].payload.runId, + ); + } + } + : undefined + } + > + + + +
+ )} +
+ + {/* Model comparison row (only if ≥2 models) */} + {showModelComparison && ( +
+
+ Performance by Model +
+
+ {modelStats.map((model) => ( +
+
= 80 + ? "hsl(142.1 76.2% 36.3%)" + : model.passRate >= 50 + ? "hsl(45.4 93.4% 47.5%)" + : "hsl(0 84.2% 60.2%)", + }} + /> + + {model.model} + + + {model.passRate}% + + + ({model.passed}/{model.total}) + +
+ ))} +
+
+ )} +
+ ); +} + +function formatTimeAgo(timestamp: number): string { + const diff = Date.now() - timestamp; + const minutes = Math.floor(diff / 60000); + if (minutes < 1) return "just now"; + if (minutes < 60) return `${minutes}m ago`; + const hours = Math.floor(minutes / 60); + if (hours < 24) return `${hours}h ago`; + const days = Math.floor(hours / 24); + return `${days}d ago`; +} + +function formatDuration(durationMs: number): string { + if (durationMs < 1000) return `${Math.round(durationMs)}ms`; + const totalSeconds = Math.round(durationMs / 1000); + if (totalSeconds < 60) return `${totalSeconds}s`; + const minutes = Math.floor(totalSeconds / 60); + const seconds = totalSeconds % 60; + if (minutes < 60) return seconds ? `${minutes}m ${seconds}s` : `${minutes}m`; + const hours = Math.floor(minutes / 60); + const remainingMinutes = minutes % 60; + return remainingMinutes ? `${hours}h ${remainingMinutes}m` : `${hours}h`; +} diff --git a/mcpjam-inspector/client/src/components/evals/suite-iterations-view.tsx b/mcpjam-inspector/client/src/components/evals/suite-iterations-view.tsx index e42963a1a..04fed89be 100644 --- a/mcpjam-inspector/client/src/components/evals/suite-iterations-view.tsx +++ b/mcpjam-inspector/client/src/components/evals/suite-iterations-view.tsx @@ -311,6 +311,13 @@ export function SuiteIterationsView({ serverNames={(suite.environment?.servers || []).filter( (name) => connectedServerNames.has(name), )} + suiteName={suite.name} + onNavigateToSuite={() => { + navigateToEvalsRoute({ + type: "suite-overview", + suiteId: suite._id, + }); + }} onBack={() => { navigateToEvalsRoute({ type: "suite-overview", diff --git a/mcpjam-inspector/client/src/components/evals/test-case-detail-view.tsx b/mcpjam-inspector/client/src/components/evals/test-case-detail-view.tsx index 86b51c48f..b19713e59 100644 --- a/mcpjam-inspector/client/src/components/evals/test-case-detail-view.tsx +++ b/mcpjam-inspector/client/src/components/evals/test-case-detail-view.tsx @@ -2,13 +2,6 @@ import { useMemo, useState } from "react"; import { X, ChevronDown, ChevronRight, Loader2 } from "lucide-react"; import { Button } from "@/components/ui/button"; import { Label } from "@/components/ui/label"; -import { AccuracyChart } from "./accuracy-chart"; -import { - ChartContainer, - ChartTooltip, - ChartTooltipContent, -} from "@/components/ui/chart"; -import { Bar, BarChart, CartesianGrid, XAxis, YAxis } from "recharts"; import { computeIterationResult } from "./pass-criteria"; import { getIterationBorderColor, formatRunId } from "./helpers"; import { IterationDetails } from "./iteration-details"; @@ -21,6 +14,8 @@ interface TestCaseDetailViewProps { onBack: () => void; onViewRun?: (runId: string) => void; serverNames?: string[]; + suiteName?: string; + onNavigateToSuite?: () => void; } export function TestCaseDetailView({ @@ -30,6 +25,8 @@ export function TestCaseDetailView({ onBack, onViewRun, serverNames = [], + suiteName, + onNavigateToSuite, }: TestCaseDetailViewProps) { const [openIterationId, setOpenIterationId] = useState(null); @@ -43,59 +40,6 @@ export function TestCaseDetailView({ ); }, [iterations, runs]); - // Performance trend data - const trendData = useMemo(() => { - const iterationsByRun = new Map(); - activeIterations.forEach((iteration) => { - if (iteration.suiteRunId) { - if (!iterationsByRun.has(iteration.suiteRunId)) { - iterationsByRun.set(iteration.suiteRunId, []); - } - iterationsByRun.get(iteration.suiteRunId)!.push(iteration); - } - }); - - const data: Array<{ - runId: string; - runIdDisplay: string; - passRate: number; - label: string; - }> = []; - - runs.forEach((run) => { - // Skip inactive runs - if (run.isActive === false) return; - - const runIters = iterationsByRun.get(run._id); - if (runIters && runIters.length > 0) { - // Only count completed iterations - exclude pending/cancelled - const iterationResults = runIters.map((iter) => - computeIterationResult(iter), - ); - const passed = iterationResults.filter((r) => r === "passed").length; - const total = iterationResults.filter( - (r) => r === "passed" || r === "failed", - ).length; - const passRate = total > 0 ? Math.round((passed / total) * 100) : 0; - - data.push({ - runId: run._id, - runIdDisplay: run._id.slice(-6), - passRate, - label: new Date(run.completedAt ?? run.createdAt).toLocaleString(), - }); - } - }); - - return data.sort((a, b) => { - const runA = runs.find((r) => r._id === a.runId); - const runB = runs.find((r) => r._id === b.runId); - const timeA = runA?.createdAt ?? 0; - const timeB = runB?.createdAt ?? 0; - return timeA - timeB; - }); - }, [activeIterations, runs]); - // Model breakdown const modelBreakdown = useMemo(() => { const modelMap = new Map< @@ -152,18 +96,57 @@ export function TestCaseDetailView({ .sort((a, b) => b.passRate - a.passRate); }, [activeIterations]); - const modelChartConfig = { - passRate: { - label: "Pass Rate", - color: "var(--chart-1)", - }, + // Compute overall stats + const overallStats = useMemo(() => { + const results = activeIterations.map((i) => computeIterationResult(i)); + const passed = results.filter((r) => r === "passed").length; + const failed = results.filter((r) => r === "failed").length; + const total = passed + failed; + const passRate = total > 0 ? Math.round((passed / total) * 100) : 0; + + // Avg duration + const completed = activeIterations.filter( + (i) => i.startedAt && i.updatedAt && i.result !== "pending", + ); + const avgDuration = + completed.length > 0 + ? completed.reduce( + (sum, i) => sum + ((i.updatedAt ?? 0) - (i.startedAt ?? 0)), + 0, + ) / completed.length + : 0; + + return { passed, failed, total, passRate, avgDuration }; + }, [activeIterations]); + + const formatDurationHelper = (ms: number) => { + if (ms < 1000) return `${Math.round(ms)}ms`; + const s = Math.round(ms / 1000); + if (s < 60) return `${s}s`; + const m = Math.floor(s / 60); + const sec = s % 60; + return sec ? `${m}m ${sec}s` : `${m}m`; }; return ( -
- {/* Header */} +
+ {/* Breadcrumb + Header */}
+
+ {suiteName && onNavigateToSuite && ( + <> + + / + + )} + Test Case +

{testCase.title || "Untitled test case"}

@@ -178,120 +161,75 @@ export function TestCaseDetailView({
- {/* Charts Side by Side */} -
- {/* Performance Chart */} - {trendData.length > 0 && ( -
-
-
- Performance across runs -
-
-
- 0 && ( +
+
+ {overallStats.passRate}% + Pass Rate + | + + {overallStats.total} iterations + + | + + Avg {formatDurationHelper(overallStats.avgDuration)} + +
+ {/* Progress bar */} +
+
+
+
+ + {overallStats.passed} passed · {overallStats.failed} failed +
- )} - - {/* Model Breakdown */} - {modelBreakdown.length > 0 && ( -
-
-
- Performance by model -
-
-
- - - - { - const parts = value.split("/"); - if (parts.length === 2 && parts[1].length > 15) { - return `${parts[0]}/${parts[1].substring(0, 12)}...`; - } - return value; - }} - /> - `${value}%`} - /> - { - if (!active || !payload || payload.length === 0) - return null; - const data = payload[0].payload; - return ( -
-
-
- - {data.model} - - - {data.passed} passed · {data.failed} failed - -
-
-
- - {data.passRate}% - -
-
-
- ); + {/* Inline model breakdown */} + {modelBreakdown.length >= 1 && ( +
+ + By Model: + + {modelBreakdown.map((model) => ( +
+
= 80 + ? "hsl(142.1 76.2% 36.3%)" + : model.passRate >= 50 + ? "hsl(45.4 93.4% 47.5%)" + : "hsl(0 84.2% 60.2%)", }} /> - - - + {model.model} + + {model.passRate}% + + + ({model.passed}/{model.passed + model.failed}) + +
+ ))}
-
- )} -
+ )} +
+ )} {/* Iterations List */}
@@ -304,7 +242,34 @@ export function TestCaseDetailView({
) : (
- {activeIterations.map((iteration) => { + {/* Column headers */} +
+
+
+ Result +
+
+
Model
+
Calls
+
Tokens
+
Time
+ {onViewRun &&
Run
} +
+
+ {/* Failing iterations first */} + {(() => { + const failing = activeIterations.filter( + (i) => computeIterationResult(i) === "failed", + ); + const passing = activeIterations.filter( + (i) => computeIterationResult(i) === "passed", + ); + const other = activeIterations.filter((i) => { + const r = computeIterationResult(i); + return r !== "failed" && r !== "passed"; + }); + return [...failing, ...passing, ...other]; + })().map((iteration) => { const snapshot = iteration.testCaseSnapshot; const startedAt = iteration.startedAt ?? iteration.createdAt; const completedAt = iteration.updatedAt ?? iteration.createdAt; @@ -393,7 +358,7 @@ export function TestCaseDetailView({ onViewRun(iteration.suiteRunId!); }} > - Run {formatRunId(iteration.suiteRunId)} + {formatTimeAgo(iteration.createdAt)}
)} @@ -422,3 +387,14 @@ export function TestCaseDetailView({
); } + +function formatTimeAgo(timestamp: number): string { + const diff = Date.now() - timestamp; + const minutes = Math.floor(diff / 60000); + if (minutes < 1) return "just now"; + if (minutes < 60) return `${minutes}m ago`; + const hours = Math.floor(minutes / 60); + if (hours < 24) return `${hours}h ago`; + const days = Math.floor(hours / 24); + return `${days}d ago`; +} diff --git a/mcpjam-inspector/client/src/components/evals/test-cases-overview.tsx b/mcpjam-inspector/client/src/components/evals/test-cases-overview.tsx index 9328d5b51..be4129702 100644 --- a/mcpjam-inspector/client/src/components/evals/test-cases-overview.tsx +++ b/mcpjam-inspector/client/src/components/evals/test-cases-overview.tsx @@ -1,13 +1,6 @@ import { useMemo } from "react"; import { computeIterationResult } from "./pass-criteria"; import type { EvalCase, EvalIteration, EvalSuiteRun } from "./types"; -import { - ChartContainer, - ChartTooltip, - ChartTooltipContent, -} from "@/components/ui/chart"; -import { Bar, BarChart, CartesianGrid, XAxis, YAxis } from "recharts"; -import { AccuracyChart } from "./accuracy-chart"; interface TestCasesOverviewProps { suite: { _id: string; name: string; source?: "ui" | "sdk" }; @@ -109,132 +102,8 @@ export function TestCasesOverview({ return seconds ? `${minutes}m ${seconds}s` : `${minutes}m`; }; - const modelChartConfig = { - passRate: { - label: "Pass Rate", - color: "var(--chart-1)", - }, - }; - return ( <> - {/* Charts Side by Side */} -
- {/* Accuracy */} -
-
-
- {suite.source === "sdk" ? "Pass Rate" : "Accuracy"} -
-
-
- -
-
- - {/* Per-Model Performance */} -
-
-
- Performance by model -
-
-
- {modelStats.length > 1 ? ( - - - - { - if (value.length > 15) { - return value.substring(0, 12) + "..."; - } - return value; - }} - /> - `${value}%`} - /> - { - if (!active || !payload || payload.length === 0) - return null; - const data = payload[0].payload; - return ( -
-
-
- - {data.model} - - - {data.passed} passed · {data.failed} failed - -
-
-
- - {data.passRate}% - -
-
-
- ); - }} - /> - - - - ) : ( -

- No model data available. -

- )} -
-
-
- {/* Test Cases List */}
diff --git a/mcpjam-inspector/client/src/lib/ci-evals-router.ts b/mcpjam-inspector/client/src/lib/ci-evals-router.ts index b086667d6..d0095ff85 100644 --- a/mcpjam-inspector/client/src/lib/ci-evals-router.ts +++ b/mcpjam-inspector/client/src/lib/ci-evals-router.ts @@ -75,7 +75,7 @@ export function parseCiEvalsRoute(): CiEvalsRoute | null { return { type: "suite-overview", suiteId, - view: view === "test-cases" ? "test-cases" : "runs", + view: view === "runs" ? "runs" : "test-cases", }; } } @@ -98,7 +98,7 @@ export function navigateToCiEvalsRoute( break; case "suite-overview": { const params = new URLSearchParams(); - if (route.view && route.view !== "runs") { + if (route.view && route.view !== "test-cases") { params.set("view", route.view); } const query = params.toString();