Skip to content

Commit 4442397

Browse files
feat(web-evals): enhance dashboard with dynamic tool columns and UX improvements (#9592)
Co-authored-by: Roo Code <[email protected]>
1 parent 3c989d3 commit 4442397

File tree

7 files changed

+727
-62
lines changed

7 files changed

+727
-62
lines changed

apps/web-evals/src/app/runs/[id]/run.tsx

Lines changed: 178 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,36 @@ import { LoaderCircle } from "lucide-react"
55

66
import type { Run, TaskMetrics as _TaskMetrics } from "@roo-code/evals"
77

8-
import { formatCurrency, formatDuration, formatTokens } from "@/lib/formatters"
8+
import { formatCurrency, formatDuration, formatTokens, formatToolUsageSuccessRate } from "@/lib/formatters"
99
import { useRunStatus } from "@/hooks/use-run-status"
10-
import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui"
10+
import {
11+
Table,
12+
TableBody,
13+
TableCell,
14+
TableHead,
15+
TableHeader,
16+
TableRow,
17+
Tooltip,
18+
TooltipContent,
19+
TooltipTrigger,
20+
} from "@/components/ui"
1121

1222
import { TaskStatus } from "./task-status"
1323
import { RunStatus } from "./run-status"
1424

1525
type TaskMetrics = Pick<_TaskMetrics, "tokensIn" | "tokensOut" | "tokensContext" | "duration" | "cost">
1626

27+
// Attempt and failure counters recorded for a single tool.
type ToolUsageEntry = { attempts: number; failures: number }
28+
// Usage counters keyed by tool name.
type ToolUsage = Record<string, ToolUsageEntry>
29+
30+
// Generate abbreviation from tool name (e.g., "read_file" -> "RF", "list_code_definition_names" -> "LCDN")
31+
/**
 * Builds a short label for a tool from its name.
 *
 * The name is split on underscores and the first character of every
 * non-empty segment is upper-cased and concatenated, e.g.
 * "read_file" -> "RF" and "list_code_definition_names" -> "LCDN".
 *
 * @param toolName - Underscore-separated tool name.
 * @returns The concatenated upper-cased initials; an empty string when the
 * name has no non-empty segments.
 */
function getToolAbbreviation(toolName: string): string {
	let abbreviation = ""

	for (const segment of toolName.split("_")) {
		// Leading, trailing or doubled underscores yield empty segments,
		// which have no first character and therefore add nothing.
		const initial = segment[0]

		if (initial !== undefined) {
			abbreviation += initial.toUpperCase()
		}
	}

	return abbreviation
}
37+
1738
export function Run({ run }: { run: Run }) {
1839
const runStatus = useRunStatus(run)
1940
const { tasks, tokenUsage, usageUpdatedAt } = runStatus
@@ -41,16 +62,170 @@ export function Run({ run }: { run: Run }) {
4162
// eslint-disable-next-line react-hooks/exhaustive-deps
4263
}, [tasks, tokenUsage, usageUpdatedAt])
4364

65+
// Compute aggregate stats
66+
const stats = useMemo(() => {
67+
if (!tasks) return null
68+
69+
const passed = tasks.filter((t) => t.passed === true).length
70+
const failed = tasks.filter((t) => t.passed === false).length
71+
// Count running tasks exactly like TaskStatus shows spinner:
72+
// - passed is not true and not false (null/undefined)
73+
// - AND has activity (startedAt or tokenUsage)
74+
const running = tasks.filter(
75+
(t) => t.passed !== true && t.passed !== false && (t.startedAt || tokenUsage.get(t.id)),
76+
).length
77+
const pending = tasks.filter(
78+
(t) => t.passed !== true && t.passed !== false && !t.startedAt && !tokenUsage.get(t.id),
79+
).length
80+
const total = tasks.length
81+
const completed = passed + failed
82+
83+
let totalTokensIn = 0
84+
let totalTokensOut = 0
85+
let totalCost = 0
86+
let totalDuration = 0
87+
88+
// Aggregate tool usage from completed tasks
89+
const toolUsage: ToolUsage = {}
90+
91+
for (const task of tasks) {
92+
const metrics = taskMetrics[task.id]
93+
if (metrics) {
94+
totalTokensIn += metrics.tokensIn
95+
totalTokensOut += metrics.tokensOut
96+
totalCost += metrics.cost
97+
totalDuration += metrics.duration
98+
}
99+
100+
// Aggregate tool usage from finished tasks with taskMetrics
101+
if (task.finishedAt && task.taskMetrics?.toolUsage) {
102+
for (const [key, usage] of Object.entries(task.taskMetrics.toolUsage)) {
103+
const tool = key as keyof ToolUsage
104+
if (!toolUsage[tool]) {
105+
toolUsage[tool] = { attempts: 0, failures: 0 }
106+
}
107+
toolUsage[tool].attempts += usage.attempts
108+
toolUsage[tool].failures += usage.failures
109+
}
110+
}
111+
}
112+
113+
return {
114+
passed,
115+
failed,
116+
running,
117+
pending,
118+
total,
119+
completed,
120+
passRate: completed > 0 ? ((passed / completed) * 100).toFixed(1) : null,
121+
totalTokensIn,
122+
totalTokensOut,
123+
totalCost,
124+
totalDuration,
125+
toolUsage,
126+
}
127+
// eslint-disable-next-line react-hooks/exhaustive-deps
128+
}, [tasks, taskMetrics, tokenUsage, usageUpdatedAt])
129+
44130
return (
45131
<>
46132
<div>
47-
<div className="mb-2">
133+
<div className="mb-4">
48134
<div>
49135
<div className="font-mono">{run.model}</div>
50136
{run.description && <div className="text-sm text-muted-foreground">{run.description}</div>}
51137
</div>
52138
{!run.taskMetricsId && <RunStatus runStatus={runStatus} />}
53139
</div>
140+
141+
{stats && (
142+
<div className="mb-4 p-4 border rounded-lg bg-muted/50">
143+
{/* Main Stats Row */}
144+
<div className="flex flex-wrap items-start justify-between gap-x-6 gap-y-3">
145+
{/* Passed/Failed */}
146+
<div className="text-center">
147+
<div className="text-2xl font-bold whitespace-nowrap">
148+
<span className="text-green-600">{stats.passed}</span>
149+
<span className="text-muted-foreground mx-1">/</span>
150+
<span className="text-red-600">{stats.failed}</span>
151+
{stats.running > 0 && (
152+
<span className="text-yellow-600 text-sm ml-2">({stats.running})</span>
153+
)}
154+
</div>
155+
<div className="text-xs text-muted-foreground">Passed / Failed</div>
156+
</div>
157+
158+
{/* Pass Rate */}
159+
<div className="text-center">
160+
<div className="text-2xl font-bold">{stats.passRate ? `${stats.passRate}%` : "-"}</div>
161+
<div className="text-xs text-muted-foreground">Pass Rate</div>
162+
</div>
163+
164+
{/* Tokens */}
165+
<div className="text-center">
166+
<div className="text-xl font-bold font-mono whitespace-nowrap">
167+
{formatTokens(stats.totalTokensIn)}
168+
<span className="text-muted-foreground mx-1">/</span>
169+
{formatTokens(stats.totalTokensOut)}
170+
</div>
171+
<div className="text-xs text-muted-foreground">Tokens In / Out</div>
172+
</div>
173+
174+
{/* Cost */}
175+
<div className="text-center">
176+
<div className="text-2xl font-bold font-mono">{formatCurrency(stats.totalCost)}</div>
177+
<div className="text-xs text-muted-foreground">Cost</div>
178+
</div>
179+
180+
{/* Duration */}
181+
<div className="text-center">
182+
<div className="text-2xl font-bold font-mono whitespace-nowrap">
183+
{stats.totalDuration > 0 ? formatDuration(stats.totalDuration) : "-"}
184+
</div>
185+
<div className="text-xs text-muted-foreground">Duration</div>
186+
</div>
187+
188+
{/* Tool Usage - Inline */}
189+
{Object.keys(stats.toolUsage).length > 0 && (
190+
<div className="flex items-center gap-2 flex-wrap">
191+
{Object.entries(stats.toolUsage)
192+
.sort(([, a], [, b]) => b.attempts - a.attempts)
193+
.map(([toolName, usage]) => {
194+
const abbr = getToolAbbreviation(toolName)
195+
const successRate =
196+
usage.attempts > 0
197+
? ((usage.attempts - usage.failures) / usage.attempts) * 100
198+
: 100
199+
const rateColor =
200+
successRate === 100
201+
? "text-green-500"
202+
: successRate >= 80
203+
? "text-yellow-500"
204+
: "text-red-500"
205+
return (
206+
<Tooltip key={toolName}>
207+
<TooltipTrigger asChild>
208+
<div className="flex items-center gap-1 px-2 py-1 rounded bg-background/50 border border-border/50 hover:border-border transition-colors cursor-default text-xs">
209+
<span className="font-medium text-muted-foreground">
210+
{abbr}
211+
</span>
212+
<span className="font-bold tabular-nums">
213+
{usage.attempts}
214+
</span>
215+
<span className={`${rateColor}`}>
216+
{formatToolUsageSuccessRate(usage)}
217+
</span>
218+
</div>
219+
</TooltipTrigger>
220+
<TooltipContent side="bottom">{toolName}</TooltipContent>
221+
</Tooltip>
222+
)
223+
})}
224+
</div>
225+
)}
226+
</div>
227+
</div>
228+
)}
54229
{!tasks ? (
55230
<LoaderCircle className="size-4 animate-spin" />
56231
) : (

0 commit comments

Comments
 (0)