feat(bench-swe): add PTerm-based TUI progress component (#43)

aeneasr · claude · web-flow · commit edb13dc5fb5b · 2026-03-17T08:04:03.000Z
Port the lumen TUI progress component to the bench-swe module,
enabling structured terminal output with progress bars, spinners,
and styled status messages. Add pterm and golang.org/x/term
dependencies to bench-swe/go.mod.

Also resolve the .gitignore merge conflict, preserving all entries.

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/.gitignore b/.gitignore
@@ -17,4 +17,4 @@ dist/
 lumen
 
 bench-swe/bench-swe
-bench-swe/overview.txt.claude/worktrees/
+bench-swe/overview.txt
diff --git a/bench-swe/cmd/analyze.go b/bench-swe/cmd/analyze.go
@@ -2,12 +2,14 @@ package cmd
 
 import (
 	"fmt"
+	"os"
 	"path/filepath"
 
 	"github.com/spf13/cobra"
 
 	"github.com/aeneasr/lumen/bench-swe/internal/analysis"
 	"github.com/aeneasr/lumen/bench-swe/internal/task"
+	"github.com/aeneasr/lumen/bench-swe/internal/tui"
 )
 
 var analyzeCmd = &cobra.Command{
@@ -31,6 +33,11 @@ func runAnalyze(_ *cobra.Command, args []string) error {
 		return fmt.Errorf("loading tasks: %w", err)
 	}
 
-	fmt.Printf("Analyzing %d tasks from %s...\n", len(tasks), resultsDir)
-	return analysis.Analyze(resultsDir, benchDir, tasks)
+	p := tui.NewProgress(os.Stderr)
+	p.Info(fmt.Sprintf("Analyzing %d tasks from %s", len(tasks), resultsDir))
+	if err := analysis.Analyze(resultsDir, benchDir, tasks); err != nil {
+		return err
+	}
+	p.Complete("Analysis complete.")
+	return nil
 }
diff --git a/bench-swe/cmd/run.go b/bench-swe/cmd/run.go
@@ -18,6 +18,7 @@ import (
 	"github.com/aeneasr/lumen/bench-swe/internal/report"
 	"github.com/aeneasr/lumen/bench-swe/internal/runner"
 	"github.com/aeneasr/lumen/bench-swe/internal/task"
+	"github.com/aeneasr/lumen/bench-swe/internal/tui"
 )
 
 var (
@@ -58,6 +59,8 @@ func runBenchmarks(cmd *cobra.Command, args []string) error {
 	ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
 	defer cancel()
 
+	p := tui.NewProgress(os.Stderr)
+
 	// Resolve paths
 	benchDir, err := findBenchDir()
 	if err != nil {
@@ -90,20 +93,25 @@ func runBenchmarks(cmd *cobra.Command, args []string) error {
 
 	// Preflight
 	if !flagSkipPreflight {
+		p.StartSpinner("Running preflight checks...")
 		pfCfg := &preflight.Config{
 			RepoRoot:    repoRoot,
 			LumenBinary: lumenBinary,
 			Backend:     backend,
 			EmbedModel:  flagEmbedModel,
 			OllamaHost:  os.Getenv("OLLAMA_HOST"),
 		}
-		if err := preflight.Validate(ctx, pfCfg); err != nil {
-			return fmt.Errorf("preflight failed: %w", err)
+		pfErr := preflight.Validate(ctx, pfCfg)
+		p.StopSpinner()
+		if pfErr != nil {
+			return fmt.Errorf("preflight failed: %w", pfErr)
 		}
 	}
 
 	// Load tasks
+	p.StartSpinner("Loading tasks...")
 	tasks, err := task.LoadTasks(tasksDir, flagLanguage)
+	p.StopSpinner()
 	if err != nil {
 		return err
 	}
@@ -122,13 +130,14 @@ func runBenchmarks(cmd *cobra.Command, args []string) error {
 	}
 
 	totalRuns := max(flagRuns, 1)
+	total := len(tasks) * len(scenarios) * totalRuns
 
 	if totalRuns > 1 {
-		fmt.Printf("\nRunning %d tasks x %d scenarios x %d runs (parallel=%d)\n\n",
-			len(tasks), len(scenarios), totalRuns, flagParallel)
+		p.Info(fmt.Sprintf("Running %d tasks × %d scenarios × %d runs (parallel=%d)",
+			len(tasks), len(scenarios), totalRuns, flagParallel))
 	} else {
-		fmt.Printf("\nRunning %d tasks x %d scenarios (parallel=%d)\n\n",
-			len(tasks), len(scenarios), flagParallel)
+		p.Info(fmt.Sprintf("Running %d tasks × %d scenarios (parallel=%d)",
+			len(tasks), len(scenarios), flagParallel))
 	}
 
 	// Run tasks
@@ -144,96 +153,123 @@ func runBenchmarks(cmd *cobra.Command, args []string) error {
 
 	var mu sync.Mutex
 	var results []runner.RunResult
+	var runRows [][]string
+	completed := 0
+
+	p.Start("Running", total)
 
 	g, gCtx := errgroup.WithContext(ctx)
 	g.SetLimit(flagParallel)
 
 	for _, t := range tasks {
 		g.Go(func() error {
-			var lines []string
 			var taskResults []runner.RunResult
+			var taskRows [][]string
 			for _, s := range scenarios {
 				for run := 1; run <= totalRuns; run++ {
 					result, err := runner.Run(gCtx, runCfg, t, s, run)
-					runLabel := fmt.Sprintf("%-10s", s)
+					runLabel := string(s)
 					if totalRuns > 1 {
-						runLabel = fmt.Sprintf("%-10s run%d", s, run)
+						runLabel = fmt.Sprintf("%s run%d", s, run)
 					}
-					var line string
+					var row []string
 					if err != nil {
-						line = fmt.Sprintf("  %-20s %s ERROR: %v\n", t.ID, runLabel, err)
+						row = []string{t.ID, runLabel, "—", "—", "—", "ERROR: " + err.Error()}
 					} else if result != nil && result.Metrics != nil {
 						m := result.Metrics
-						durS := float64(m.DurationMS) / 1000.0
-						line = fmt.Sprintf("  %-20s %s done  [%5.1fs  $%.4f  in=%d+%dcr  out=%d]\n",
-							t.ID, runLabel, durS, m.CostUSD, m.InputTokens, m.CacheRead, m.OutputTokens)
-					} else if result != nil {
-						line = fmt.Sprintf("  %-20s %s done  (no metrics)\n", t.ID, runLabel)
+						row = []string{
+							t.ID,
+							runLabel,
+							fmt.Sprintf("%.1fs", float64(m.DurationMS)/1000.0),
+							fmt.Sprintf("$%.4f", m.CostUSD),
+							fmt.Sprintf("%d+%dcr/%d", m.InputTokens, m.CacheRead, m.OutputTokens),
+							"done",
+						}
+					} else {
+						row = []string{t.ID, runLabel, "—", "—", "—", "done (no metrics)"}
 					}
-					lines = append(lines, line)
+					taskRows = append(taskRows, row)
 					if result != nil {
 						taskResults = append(taskResults, *result)
 					}
 				}
 			}
 			mu.Lock()
-			defer mu.Unlock()
-			for _, l := range lines {
-				fmt.Print(l)
-			}
+			completed += len(taskRows)
+			p.Update(completed, t.ID)
+			runRows = append(runRows, taskRows...)
 			results = append(results, taskResults...)
+			mu.Unlock()
 			return nil
 		})
 	}
 
 	if err := g.Wait(); err != nil {
+		p.Stop()
 		return err
 	}
+	p.Stop()
+
+	p.PrintTable([]string{"Task", "Scenario", "Time", "Cost", "Tokens (in+cr/out)", "Status"}, runRows)
 
 	// Judge (fresh context so a canceled run phase doesn't block judging)
 	if !flagSkipJudge {
-		fmt.Println("\nJudging results...")
+		p.Info("Judging results...")
 		judgeCtx, judgeCancel := signal.NotifyContext(context.Background(), os.Interrupt)
 		defer judgeCancel()
+
 		var judgeMu sync.Mutex
+		var judgeRows [][]string
+		judgeCompleted := 0
+		judgeTotal := len(tasks) * len(scenarios) * totalRuns
+
+		p.Start("Judging", judgeTotal)
+
 		judgeG, judgeCtx := errgroup.WithContext(judgeCtx)
 		judgeG.SetLimit(flagParallel)
 
 		for _, t := range tasks {
 			judgeG.Go(func() error {
-				var lines []string
+				var taskRows [][]string
 				for _, s := range scenarios {
 					for run := 1; run <= totalRuns; run++ {
 						slug := runner.Slug(t.ID, s, run, totalRuns)
 						result, err := judgeTask(judgeCtx, benchDir, runCfg, t, s, slug)
-						runLabel := fmt.Sprintf("%-10s", s)
+						runLabel := string(s)
 						if totalRuns > 1 {
-							runLabel = fmt.Sprintf("%-10s run%d", s, run)
+							runLabel = fmt.Sprintf("%s run%d", s, run)
 						}
-						var line string
+						var row []string
 						if err != nil {
-							line = fmt.Sprintf("  %-20s %s error: %v\n", t.ID, runLabel, err)
+							row = []string{t.ID, runLabel, "ERROR: " + err.Error()}
 						} else if result != nil {
-							line = fmt.Sprintf("  %-20s %s %s\n", t.ID, runLabel, result.Rating)
+							row = []string{t.ID, runLabel, string(result.Rating)}
+						} else {
+							row = []string{t.ID, runLabel, "—"}
 						}
-						lines = append(lines, line)
+						taskRows = append(taskRows, row)
 					}
 				}
 				judgeMu.Lock()
-				defer judgeMu.Unlock()
-				for _, l := range lines {
-					fmt.Print(l)
-				}
+				judgeCompleted += len(taskRows)
+				p.Update(judgeCompleted, t.ID)
+				judgeRows = append(judgeRows, taskRows...)
+				judgeMu.Unlock()
 				return nil
 			})
 		}
+
 		if err := judgeG.Wait(); err != nil {
-			fmt.Printf("  Judge error: %v\n", err)
+			p.Stop()
+			p.Error(fmt.Sprintf("Judge error: %v", err))
+		} else {
+			p.Stop()
 		}
+		p.PrintTable([]string{"Task", "Scenario", "Rating"}, judgeRows)
 	}
 
 	// Reports
-	fmt.Println("\nGenerating reports...")
+	p.Info("Generating reports...")
 	rptCfg := &report.Config{
 		ResultsDir:  resultsDir,
 		EmbedModel:  flagEmbedModel,
@@ -251,7 +287,7 @@ func runBenchmarks(cmd *cobra.Command, args []string) error {
 		return err
 	}
 
-	fmt.Printf("\nResults: %s\n", resultsDir)
+	p.Complete("Results: " + resultsDir)
 	return nil
 }
 
diff --git a/bench-swe/cmd/validate.go b/bench-swe/cmd/validate.go
@@ -8,6 +8,7 @@ import (
 	"github.com/spf13/cobra"
 
 	"github.com/aeneasr/lumen/bench-swe/internal/task"
+	"github.com/aeneasr/lumen/bench-swe/internal/tui"
 )
 
 const grepScoreThreshold = 0.5
@@ -41,11 +42,15 @@ func runValidate(_ *cobra.Command, args []string) error {
 		return err
 	}
 
+	p := tui.NewProgress(os.Stderr)
+
 	tasks, err := task.LoadTasks(tasksDir, nil)
 	if err != nil {
 		return err
 	}
 
+	p.Info(fmt.Sprintf("Validating %d tasks from %s", len(tasks), tasksDir))
+
 	type result struct {
 		t      task.Task
 		score  float64
@@ -79,27 +84,28 @@ func runValidate(_ *cobra.Command, args []string) error {
 	var failed int
 	for _, r := range results {
 		if r.err != "" {
-			fmt.Printf("ERROR  %s: %s\n", r.t.ID, r.err)
+			p.Error(fmt.Sprintf("%s: %s", r.t.ID, r.err))
 			failed++
 			continue
 		}
-		label := "OK    "
-		if r.score >= grepScoreThreshold {
-			label = "REJECT"
-			failed++
-		} else if r.score > 0 {
-			label = "WARN  "
-		}
-		fmt.Printf("%s %s  grep_score=%.0f%%", label, r.t.ID, r.score*100)
+		msg := fmt.Sprintf("%s  grep_score=%.0f%%", r.t.ID, r.score*100)
 		if len(r.leaked) > 0 {
-			fmt.Printf("  leaked=%v", r.leaked)
+			msg += fmt.Sprintf("  leaked=%v", r.leaked)
+		}
+		switch {
+		case r.score >= grepScoreThreshold:
+			p.Error("REJECT " + msg)
+			failed++
+		case r.score > 0:
+			p.Warn("WARN   " + msg)
+		default:
+			p.Info("OK     " + msg)
 		}
-		fmt.Println()
 	}
 
 	if failed > 0 {
 		return fmt.Errorf("%d task(s) failed validation", failed)
 	}
-	fmt.Printf("\nAll %d tasks passed.\n", len(results))
+	p.Complete(fmt.Sprintf("All %d tasks passed.", len(results)))
 	return nil
 }
diff --git a/bench-swe/go.mod b/bench-swe/go.mod
@@ -1,13 +1,26 @@
 module github.com/aeneasr/lumen/bench-swe
 
-go 1.23.0
+go 1.25.0
 
 require (
+	github.com/pterm/pterm v0.12.83
 	github.com/spf13/cobra v1.9.1
-	golang.org/x/sync v0.12.0
+	golang.org/x/sync v0.19.0
+	golang.org/x/term v0.41.0
 )
 
 require (
+	atomicgo.dev/cursor v0.2.0 // indirect
+	atomicgo.dev/keyboard v0.2.9 // indirect
+	atomicgo.dev/schedule v0.1.0 // indirect
+	github.com/clipperhouse/uax29/v2 v2.7.0 // indirect
+	github.com/containerd/console v1.0.5 // indirect
+	github.com/gookit/color v1.6.0 // indirect
 	github.com/inconshreveable/mousetrap v1.1.0 // indirect
+	github.com/lithammer/fuzzysearch v1.1.8 // indirect
+	github.com/mattn/go-runewidth v0.0.20 // indirect
 	github.com/spf13/pflag v1.0.6 // indirect
+	github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect
+	golang.org/x/sys v0.42.0 // indirect
+	golang.org/x/text v0.34.0 // indirect
 )
diff --git a/bench-swe/go.sum b/bench-swe/go.sum
diff --git a/bench-swe/internal/tui/progress.go b/bench-swe/internal/tui/progress.go