Skip to content

Commit edb13dc

Browse files
aeneasr and claude authored
feat(bench-swe): add PTerm-based TUI progress component (#43)
Port the lumen TUI progress component to the bench-swe module, enabling structured terminal output with progress bars, spinners, and styled status messages. Add pterm and golang.org/x/term dependencies to bench-swe/go.mod. Also resolve .gitignore merge conflict preserving all entries. Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 02f2ef8 commit edb13dc

File tree

7 files changed

+354
-55
lines changed

7 files changed

+354
-55
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,4 +17,4 @@ dist/
1717
lumen
1818

1919
bench-swe/bench-swe
20-
bench-swe/overview.txt.claude/worktrees/
20+
bench-swe/overview.txt

bench-swe/cmd/analyze.go

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2,12 +2,14 @@ package cmd
22

33
import (
44
"fmt"
5+
"os"
56
"path/filepath"
67

78
"github.com/spf13/cobra"
89

910
"github.com/aeneasr/lumen/bench-swe/internal/analysis"
1011
"github.com/aeneasr/lumen/bench-swe/internal/task"
12+
"github.com/aeneasr/lumen/bench-swe/internal/tui"
1113
)
1214

1315
var analyzeCmd = &cobra.Command{
@@ -31,6 +33,11 @@ func runAnalyze(_ *cobra.Command, args []string) error {
3133
return fmt.Errorf("loading tasks: %w", err)
3234
}
3335

34-
fmt.Printf("Analyzing %d tasks from %s...\n", len(tasks), resultsDir)
35-
return analysis.Analyze(resultsDir, benchDir, tasks)
36+
p := tui.NewProgress(os.Stderr)
37+
p.Info(fmt.Sprintf("Analyzing %d tasks from %s", len(tasks), resultsDir))
38+
if err := analysis.Analyze(resultsDir, benchDir, tasks); err != nil {
39+
return err
40+
}
41+
p.Complete("Analysis complete.")
42+
return nil
3643
}

bench-swe/cmd/run.go

Lines changed: 72 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@ import (
1818
"github.com/aeneasr/lumen/bench-swe/internal/report"
1919
"github.com/aeneasr/lumen/bench-swe/internal/runner"
2020
"github.com/aeneasr/lumen/bench-swe/internal/task"
21+
"github.com/aeneasr/lumen/bench-swe/internal/tui"
2122
)
2223

2324
var (
@@ -58,6 +59,8 @@ func runBenchmarks(cmd *cobra.Command, args []string) error {
5859
ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
5960
defer cancel()
6061

62+
p := tui.NewProgress(os.Stderr)
63+
6164
// Resolve paths
6265
benchDir, err := findBenchDir()
6366
if err != nil {
@@ -90,20 +93,25 @@ func runBenchmarks(cmd *cobra.Command, args []string) error {
9093

9194
// Preflight
9295
if !flagSkipPreflight {
96+
p.StartSpinner("Running preflight checks...")
9397
pfCfg := &preflight.Config{
9498
RepoRoot: repoRoot,
9599
LumenBinary: lumenBinary,
96100
Backend: backend,
97101
EmbedModel: flagEmbedModel,
98102
OllamaHost: os.Getenv("OLLAMA_HOST"),
99103
}
100-
if err := preflight.Validate(ctx, pfCfg); err != nil {
101-
return fmt.Errorf("preflight failed: %w", err)
104+
pfErr := preflight.Validate(ctx, pfCfg)
105+
p.StopSpinner()
106+
if pfErr != nil {
107+
return fmt.Errorf("preflight failed: %w", pfErr)
102108
}
103109
}
104110

105111
// Load tasks
112+
p.StartSpinner("Loading tasks...")
106113
tasks, err := task.LoadTasks(tasksDir, flagLanguage)
114+
p.StopSpinner()
107115
if err != nil {
108116
return err
109117
}
@@ -122,13 +130,14 @@ func runBenchmarks(cmd *cobra.Command, args []string) error {
122130
}
123131

124132
totalRuns := max(flagRuns, 1)
133+
total := len(tasks) * len(scenarios) * totalRuns
125134

126135
if totalRuns > 1 {
127-
fmt.Printf("\nRunning %d tasks x %d scenarios x %d runs (parallel=%d)\n\n",
128-
len(tasks), len(scenarios), totalRuns, flagParallel)
136+
p.Info(fmt.Sprintf("Running %d tasks × %d scenarios × %d runs (parallel=%d)",
137+
len(tasks), len(scenarios), totalRuns, flagParallel))
129138
} else {
130-
fmt.Printf("\nRunning %d tasks x %d scenarios (parallel=%d)\n\n",
131-
len(tasks), len(scenarios), flagParallel)
139+
p.Info(fmt.Sprintf("Running %d tasks × %d scenarios (parallel=%d)",
140+
len(tasks), len(scenarios), flagParallel))
132141
}
133142

134143
// Run tasks
@@ -144,96 +153,123 @@ func runBenchmarks(cmd *cobra.Command, args []string) error {
144153

145154
var mu sync.Mutex
146155
var results []runner.RunResult
156+
var runRows [][]string
157+
completed := 0
158+
159+
p.Start("Running", total)
147160

148161
g, gCtx := errgroup.WithContext(ctx)
149162
g.SetLimit(flagParallel)
150163

151164
for _, t := range tasks {
152165
g.Go(func() error {
153-
var lines []string
154166
var taskResults []runner.RunResult
167+
var taskRows [][]string
155168
for _, s := range scenarios {
156169
for run := 1; run <= totalRuns; run++ {
157170
result, err := runner.Run(gCtx, runCfg, t, s, run)
158-
runLabel := fmt.Sprintf("%-10s", s)
171+
runLabel := string(s)
159172
if totalRuns > 1 {
160-
runLabel = fmt.Sprintf("%-10s run%d", s, run)
173+
runLabel = fmt.Sprintf("%s run%d", s, run)
161174
}
162-
var line string
175+
var row []string
163176
if err != nil {
164-
line = fmt.Sprintf(" %-20s %s ERROR: %v\n", t.ID, runLabel, err)
177+
row = []string{t.ID, runLabel, "—", "—", "—", "ERROR: " + err.Error()}
165178
} else if result != nil && result.Metrics != nil {
166179
m := result.Metrics
167-
durS := float64(m.DurationMS) / 1000.0
168-
line = fmt.Sprintf(" %-20s %s done [%5.1fs $%.4f in=%d+%dcr out=%d]\n",
169-
t.ID, runLabel, durS, m.CostUSD, m.InputTokens, m.CacheRead, m.OutputTokens)
170-
} else if result != nil {
171-
line = fmt.Sprintf(" %-20s %s done (no metrics)\n", t.ID, runLabel)
180+
row = []string{
181+
t.ID,
182+
runLabel,
183+
fmt.Sprintf("%.1fs", float64(m.DurationMS)/1000.0),
184+
fmt.Sprintf("$%.4f", m.CostUSD),
185+
fmt.Sprintf("%d+%dcr/%d", m.InputTokens, m.CacheRead, m.OutputTokens),
186+
"done",
187+
}
188+
} else {
189+
row = []string{t.ID, runLabel, "—", "—", "—", "done (no metrics)"}
172190
}
173-
lines = append(lines, line)
191+
taskRows = append(taskRows, row)
174192
if result != nil {
175193
taskResults = append(taskResults, *result)
176194
}
177195
}
178196
}
179197
mu.Lock()
180-
defer mu.Unlock()
181-
for _, l := range lines {
182-
fmt.Print(l)
183-
}
198+
completed += len(taskRows)
199+
p.Update(completed, t.ID)
200+
runRows = append(runRows, taskRows...)
184201
results = append(results, taskResults...)
202+
mu.Unlock()
185203
return nil
186204
})
187205
}
188206

189207
if err := g.Wait(); err != nil {
208+
p.Stop()
190209
return err
191210
}
211+
p.Stop()
212+
213+
p.PrintTable([]string{"Task", "Scenario", "Time", "Cost", "Tokens (in+cr/out)", "Status"}, runRows)
192214

193215
// Judge (fresh context so a canceled run phase doesn't block judging)
194216
if !flagSkipJudge {
195-
fmt.Println("\nJudging results...")
217+
p.Info("Judging results...")
196218
judgeCtx, judgeCancel := signal.NotifyContext(context.Background(), os.Interrupt)
197219
defer judgeCancel()
220+
198221
var judgeMu sync.Mutex
222+
var judgeRows [][]string
223+
judgeCompleted := 0
224+
judgeTotal := len(tasks) * len(scenarios) * totalRuns
225+
226+
p.Start("Judging", judgeTotal)
227+
199228
judgeG, judgeCtx := errgroup.WithContext(judgeCtx)
200229
judgeG.SetLimit(flagParallel)
201230

202231
for _, t := range tasks {
203232
judgeG.Go(func() error {
204-
var lines []string
233+
var taskRows [][]string
205234
for _, s := range scenarios {
206235
for run := 1; run <= totalRuns; run++ {
207236
slug := runner.Slug(t.ID, s, run, totalRuns)
208237
result, err := judgeTask(judgeCtx, benchDir, runCfg, t, s, slug)
209-
runLabel := fmt.Sprintf("%-10s", s)
238+
runLabel := string(s)
210239
if totalRuns > 1 {
211-
runLabel = fmt.Sprintf("%-10s run%d", s, run)
240+
runLabel = fmt.Sprintf("%s run%d", s, run)
212241
}
213-
var line string
242+
var row []string
214243
if err != nil {
215-
line = fmt.Sprintf(" %-20s %s error: %v\n", t.ID, runLabel, err)
244+
row = []string{t.ID, runLabel, "ERROR: " + err.Error()}
216245
} else if result != nil {
217-
line = fmt.Sprintf(" %-20s %s %s\n", t.ID, runLabel, result.Rating)
246+
row = []string{t.ID, runLabel, string(result.Rating)}
247+
} else {
248+
row = []string{t.ID, runLabel, "—"}
218249
}
219-
lines = append(lines, line)
250+
taskRows = append(taskRows, row)
220251
}
221252
}
222253
judgeMu.Lock()
223-
defer judgeMu.Unlock()
224-
for _, l := range lines {
225-
fmt.Print(l)
226-
}
254+
judgeCompleted += len(taskRows)
255+
p.Update(judgeCompleted, t.ID)
256+
judgeRows = append(judgeRows, taskRows...)
257+
judgeMu.Unlock()
227258
return nil
228259
})
229260
}
261+
230262
if err := judgeG.Wait(); err != nil {
231-
fmt.Printf(" Judge error: %v\n", err)
263+
p.Stop()
264+
p.Error(fmt.Sprintf("Judge error: %v", err))
265+
} else {
266+
p.Stop()
232267
}
268+
p.PrintTable([]string{"Task", "Scenario", "Rating"}, judgeRows)
233269
}
234270

235271
// Reports
236-
fmt.Println("\nGenerating reports...")
272+
p.Info("Generating reports...")
237273
rptCfg := &report.Config{
238274
ResultsDir: resultsDir,
239275
EmbedModel: flagEmbedModel,
@@ -251,7 +287,7 @@ func runBenchmarks(cmd *cobra.Command, args []string) error {
251287
return err
252288
}
253289

254-
fmt.Printf("\nResults: %s\n", resultsDir)
290+
p.Complete("Results: " + resultsDir)
255291
return nil
256292
}
257293

bench-swe/cmd/validate.go

Lines changed: 18 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@ import (
88
"github.com/spf13/cobra"
99

1010
"github.com/aeneasr/lumen/bench-swe/internal/task"
11+
"github.com/aeneasr/lumen/bench-swe/internal/tui"
1112
)
1213

1314
const grepScoreThreshold = 0.5
@@ -41,11 +42,15 @@ func runValidate(_ *cobra.Command, args []string) error {
4142
return err
4243
}
4344

45+
p := tui.NewProgress(os.Stderr)
46+
4447
tasks, err := task.LoadTasks(tasksDir, nil)
4548
if err != nil {
4649
return err
4750
}
4851

52+
p.Info(fmt.Sprintf("Validating %d tasks from %s", len(tasks), tasksDir))
53+
4954
type result struct {
5055
t task.Task
5156
score float64
@@ -79,27 +84,28 @@ func runValidate(_ *cobra.Command, args []string) error {
7984
var failed int
8085
for _, r := range results {
8186
if r.err != "" {
82-
fmt.Printf("ERROR %s: %s\n", r.t.ID, r.err)
87+
p.Error(fmt.Sprintf("%s: %s", r.t.ID, r.err))
8388
failed++
8489
continue
8590
}
86-
label := "OK "
87-
if r.score >= grepScoreThreshold {
88-
label = "REJECT"
89-
failed++
90-
} else if r.score > 0 {
91-
label = "WARN "
92-
}
93-
fmt.Printf("%s %s grep_score=%.0f%%", label, r.t.ID, r.score*100)
91+
msg := fmt.Sprintf("%s grep_score=%.0f%%", r.t.ID, r.score*100)
9492
if len(r.leaked) > 0 {
95-
fmt.Printf(" leaked=%v", r.leaked)
93+
msg += fmt.Sprintf(" leaked=%v", r.leaked)
94+
}
95+
switch {
96+
case r.score >= grepScoreThreshold:
97+
p.Error("REJECT " + msg)
98+
failed++
99+
case r.score > 0:
100+
p.Warn("WARN " + msg)
101+
default:
102+
p.Info("OK " + msg)
96103
}
97-
fmt.Println()
98104
}
99105

100106
if failed > 0 {
101107
return fmt.Errorf("%d task(s) failed validation", failed)
102108
}
103-
fmt.Printf("\nAll %d tasks passed.\n", len(results))
109+
p.Complete(fmt.Sprintf("All %d tasks passed.", len(results)))
104110
return nil
105111
}

bench-swe/go.mod

Lines changed: 15 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,26 @@
11
module github.com/aeneasr/lumen/bench-swe
22

3-
go 1.23.0
3+
go 1.25.0
44

55
require (
6+
github.com/pterm/pterm v0.12.83
67
github.com/spf13/cobra v1.9.1
7-
golang.org/x/sync v0.12.0
8+
golang.org/x/sync v0.19.0
9+
golang.org/x/term v0.41.0
810
)
911

1012
require (
13+
atomicgo.dev/cursor v0.2.0 // indirect
14+
atomicgo.dev/keyboard v0.2.9 // indirect
15+
atomicgo.dev/schedule v0.1.0 // indirect
16+
github.com/clipperhouse/uax29/v2 v2.7.0 // indirect
17+
github.com/containerd/console v1.0.5 // indirect
18+
github.com/gookit/color v1.6.0 // indirect
1119
github.com/inconshreveable/mousetrap v1.1.0 // indirect
20+
github.com/lithammer/fuzzysearch v1.1.8 // indirect
21+
github.com/mattn/go-runewidth v0.0.20 // indirect
1222
github.com/spf13/pflag v1.0.6 // indirect
23+
github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect
24+
golang.org/x/sys v0.42.0 // indirect
25+
golang.org/x/text v0.34.0 // indirect
1326
)

0 commit comments

Comments
 (0)