Skip to content

Commit 87af3b3

Browse files
authored
Record tool usages in the Cline object, and persist them in the db for evals (#2729)
1 parent b5a77e3 commit 87af3b3

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

58 files changed

+977
-222
lines changed

evals/apps/cli/src/index.ts

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -275,7 +275,12 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
275275
})
276276
}
277277

278-
if (eventName === RooCodeEventName.TaskCompleted || eventName === RooCodeEventName.TaskAborted) {
278+
if (eventName === RooCodeEventName.TaskCompleted && taskMetricsId) {
279+
const toolUsage = payload[2]
280+
await updateTaskMetrics(taskMetricsId, { toolUsage })
281+
}
282+
283+
if (eventName === RooCodeEventName.TaskAborted || eventName === RooCodeEventName.TaskCompleted) {
279284
taskFinishedAt = Date.now()
280285
await updateTask(task.id, { finishedAt: new Date() })
281286
}

evals/apps/web/src/app/home.tsx

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import { Ellipsis, Rocket } from "lucide-react"
88
import type { Run, TaskMetrics } from "@evals/db"
99

1010
import { deleteRun } from "@/lib/server/runs"
11-
import { formatCurrency, formatDuration, formatTokens } from "@/lib"
11+
import { formatCurrency, formatDuration, formatTokens, formatToolUsageSuccessRate } from "@/lib/formatters"
1212
import {
1313
Button,
1414
Table,
@@ -59,7 +59,8 @@ export function Home({ runs }: { runs: (Run & { taskMetrics: TaskMetrics | null
5959
<TableHead>Passed</TableHead>
6060
<TableHead>Failed</TableHead>
6161
<TableHead>% Correct</TableHead>
62-
<TableHead className="text-center">Tokens In / Out</TableHead>
62+
<TableHead>Tokens In / Out</TableHead>
63+
<TableHead>Diff Edits</TableHead>
6364
<TableHead>Cost</TableHead>
6465
<TableHead>Duration</TableHead>
6566
<TableHead />
@@ -79,12 +80,21 @@ export function Home({ runs }: { runs: (Run & { taskMetrics: TaskMetrics | null
7980
</TableCell>
8081
<TableCell>
8182
{taskMetrics && (
82-
<div className="flex items-center justify-evenly">
83+
<div className="flex items-center gap-1.5">
8384
<div>{formatTokens(taskMetrics.tokensIn)}</div>/
8485
<div>{formatTokens(taskMetrics.tokensOut)}</div>
8586
</div>
8687
)}
8788
</TableCell>
89+
<TableCell>
90+
{taskMetrics?.toolUsage?.apply_diff && (
91+
<div className="flex flex-row items-center gap-1.5">
92+
<div>{taskMetrics.toolUsage.apply_diff.attempts}</div>
93+
<div>/</div>
94+
<div>{formatToolUsageSuccessRate(taskMetrics.toolUsage.apply_diff)}</div>
95+
</div>
96+
)}
97+
</TableCell>
8898
<TableCell>{taskMetrics && formatCurrency(taskMetrics.cost)}</TableCell>
8999
<TableCell>{taskMetrics && formatDuration(taskMetrics.duration)}</TableCell>
90100
<TableCell>

evals/apps/web/src/app/runs/[id]/run.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import { LoaderCircle } from "lucide-react"
55

66
import * as db from "@evals/db"
77

8-
import { formatCurrency, formatDuration, formatTokens } from "@/lib"
8+
import { formatCurrency, formatDuration, formatTokens } from "@/lib/formatters"
99
import { useRunStatus } from "@/hooks/use-run-status"
1010
import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui"
1111

evals/apps/web/src/lib/format-currency.ts

Lines changed: 0 additions & 6 deletions
This file was deleted.

evals/apps/web/src/lib/format-duration.ts

Lines changed: 0 additions & 22 deletions
This file was deleted.

evals/apps/web/src/lib/format-tokens.ts

Lines changed: 0 additions & 15 deletions
This file was deleted.
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
// Shared USD formatter; constructed once at module load so each call reuses it.
const usdFormatter = new Intl.NumberFormat("en-US", { style: "currency", currency: "USD" })

/**
 * Formats a numeric amount as a US-dollar currency string using the
 * "en-US" locale.
 *
 * @param amount - The amount to format.
 * @returns The formatted currency string.
 */
export const formatCurrency = (amount: number) => {
	return usdFormatter.format(amount)
}
7+
8+
/**
 * Renders a millisecond duration as a compact "Xh Ym Zs" string.
 *
 * Sub-second precision is dropped (the value is floored to whole seconds).
 * Hour and minute components are omitted when they are not positive; the
 * seconds component is omitted when it is not positive unless nothing else
 * would be shown, so a zero duration yields "0s".
 *
 * @param durationMs - Duration in milliseconds.
 * @returns Space-separated components, e.g. "1h 5s".
 */
export const formatDuration = (durationMs: number) => {
	const totalSeconds = Math.floor(durationMs / 1000)
	const h = Math.floor(totalSeconds / 3600)
	const m = Math.floor((totalSeconds % 3600) / 60)
	const s = totalSeconds % 60

	// Build the larger units first, discarding the ones that are not positive.
	const segments = [h > 0 ? `${h}h` : "", m > 0 ? `${m}m` : ""].filter((segment) => segment !== "")

	// Always emit at least one component so the result is never empty.
	if (s > 0 || segments.length === 0) {
		segments.push(`${s}s`)
	}

	return segments.join(" ")
}
30+
31+
/**
 * Formats a token count with a one-decimal "k" / "M" / "B" suffix.
 *
 * Values below 1000 are returned via `toString()` unchanged. For larger
 * values the unit is chosen AFTER rounding to one decimal place, so a count
 * just under a unit boundary is promoted to the next unit (999999 becomes
 * "1.0M") instead of being rendered as "1000.0k".
 *
 * @param tokens - The token count to format.
 * @returns The abbreviated count, e.g. "1.5k", "2.0M", "3.2B".
 */
export const formatTokens = (tokens: number) => {
	if (tokens < 1000) {
		return tokens.toString()
	}

	const units: ReadonlyArray<readonly [number, string]> = [
		[1000, "k"],
		[1000000, "M"],
		[1000000000, "B"],
	]

	for (const [divisor, suffix] of units) {
		const scaled = (tokens / divisor).toFixed(1)

		// Compare the rounded value, not the raw one: "1000.0" means the
		// number belongs to the next unit up.
		if (Number(scaled) < 1000) {
			return `${scaled}${suffix}`
		}
	}

	// Larger than the biggest unit can abbreviate (or non-finite): fall back
	// to billions, matching the previous behavior.
	return `${(tokens / 1000000000).toFixed(1)}B`
}
46+
47+
/**
 * Formats the share of successful tool invocations as a percentage with one
 * decimal place.
 *
 * @param usage - Counts of total `attempts` and of `failures` among them.
 * @returns "0%" when there were no attempts (avoids dividing by zero);
 *          otherwise the success percentage, e.g. "75.0%".
 */
export const formatToolUsageSuccessRate = (usage: { attempts: number; failures: number }) => {
	const { attempts, failures } = usage

	if (attempts === 0) {
		return "0%"
	}

	const successRatio = (attempts - failures) / attempts
	const percentage = successRatio * 100

	return `${percentage.toFixed(1)}%`
}

evals/apps/web/src/lib/index.ts

Lines changed: 0 additions & 3 deletions
This file was deleted.

evals/packages/db/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
test.db
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
ALTER TABLE `taskMetrics` ADD `toolUsage` text;

0 commit comments

Comments
 (0)