@@ -18,6 +18,7 @@ import (
1818 "github.com/aeneasr/lumen/bench-swe/internal/report"
1919 "github.com/aeneasr/lumen/bench-swe/internal/runner"
2020 "github.com/aeneasr/lumen/bench-swe/internal/task"
21+ "github.com/aeneasr/lumen/bench-swe/internal/tui"
2122)
2223
2324var (
@@ -58,6 +59,8 @@ func runBenchmarks(cmd *cobra.Command, args []string) error {
5859 ctx , cancel := signal .NotifyContext (context .Background (), os .Interrupt )
5960 defer cancel ()
6061
62+ p := tui .NewProgress (os .Stderr )
63+
6164 // Resolve paths
6265 benchDir , err := findBenchDir ()
6366 if err != nil {
@@ -90,20 +93,25 @@ func runBenchmarks(cmd *cobra.Command, args []string) error {
9093
9194 // Preflight
9295 if ! flagSkipPreflight {
96+ p .StartSpinner ("Running preflight checks..." )
9397 pfCfg := & preflight.Config {
9498 RepoRoot : repoRoot ,
9599 LumenBinary : lumenBinary ,
96100 Backend : backend ,
97101 EmbedModel : flagEmbedModel ,
98102 OllamaHost : os .Getenv ("OLLAMA_HOST" ),
99103 }
100- if err := preflight .Validate (ctx , pfCfg ); err != nil {
101- return fmt .Errorf ("preflight failed: %w" , err )
104+ pfErr := preflight .Validate (ctx , pfCfg )
105+ p .StopSpinner ()
106+ if pfErr != nil {
107+ return fmt .Errorf ("preflight failed: %w" , pfErr )
102108 }
103109 }
104110
105111 // Load tasks
112+ p .StartSpinner ("Loading tasks..." )
106113 tasks , err := task .LoadTasks (tasksDir , flagLanguage )
114+ p .StopSpinner ()
107115 if err != nil {
108116 return err
109117 }
@@ -122,13 +130,14 @@ func runBenchmarks(cmd *cobra.Command, args []string) error {
122130 }
123131
124132 totalRuns := max (flagRuns , 1 )
133+ total := len (tasks ) * len (scenarios ) * totalRuns
125134
126135 if totalRuns > 1 {
127- fmt .Printf ( " \n Running %d tasks x %d scenarios x %d runs (parallel=%d)\n \n " ,
128- len (tasks ), len (scenarios ), totalRuns , flagParallel )
136+ p . Info ( fmt .Sprintf ( "Running %d tasks × %d scenarios × %d runs (parallel=%d)" ,
137+ len (tasks ), len (scenarios ), totalRuns , flagParallel ))
129138 } else {
130- fmt .Printf ( " \n Running %d tasks x %d scenarios (parallel=%d)\n \n " ,
131- len (tasks ), len (scenarios ), flagParallel )
139+ p . Info ( fmt .Sprintf ( "Running %d tasks × %d scenarios (parallel=%d)" ,
140+ len (tasks ), len (scenarios ), flagParallel ))
132141 }
133142
134143 // Run tasks
@@ -144,96 +153,123 @@ func runBenchmarks(cmd *cobra.Command, args []string) error {
144153
145154 var mu sync.Mutex
146155 var results []runner.RunResult
156+ var runRows [][]string
157+ completed := 0
158+
159+ p .Start ("Running" , total )
147160
148161 g , gCtx := errgroup .WithContext (ctx )
149162 g .SetLimit (flagParallel )
150163
151164 for _ , t := range tasks {
152165 g .Go (func () error {
153- var lines []string
154166 var taskResults []runner.RunResult
167+ var taskRows [][]string
155168 for _ , s := range scenarios {
156169 for run := 1 ; run <= totalRuns ; run ++ {
157170 result , err := runner .Run (gCtx , runCfg , t , s , run )
158- runLabel := fmt . Sprintf ( "%-10s" , s )
171+ runLabel := string ( s )
159172 if totalRuns > 1 {
160- runLabel = fmt .Sprintf ("%-10s run%d" , s , run )
173+ runLabel = fmt .Sprintf ("%s run%d" , s , run )
161174 }
162- var line string
175+ var row [] string
163176 if err != nil {
164- line = fmt . Sprintf ( " %-20s %s ERROR: %v \n " , t . ID , runLabel , err )
177+ row = [] string { t . ID , runLabel , "—" , "—" , "—" , " ERROR: " + err . Error ()}
165178 } else if result != nil && result .Metrics != nil {
166179 m := result .Metrics
167- durS := float64 (m .DurationMS ) / 1000.0
168- line = fmt .Sprintf (" %-20s %s done [%5.1fs $%.4f in=%d+%dcr out=%d]\n " ,
169- t .ID , runLabel , durS , m .CostUSD , m .InputTokens , m .CacheRead , m .OutputTokens )
170- } else if result != nil {
171- line = fmt .Sprintf (" %-20s %s done (no metrics)\n " , t .ID , runLabel )
180+ row = []string {
181+ t .ID ,
182+ runLabel ,
183+ fmt .Sprintf ("%.1fs" , float64 (m .DurationMS )/ 1000.0 ),
184+ fmt .Sprintf ("$%.4f" , m .CostUSD ),
185+ fmt .Sprintf ("%d+%dcr/%d" , m .InputTokens , m .CacheRead , m .OutputTokens ),
186+ "done" ,
187+ }
188+ } else {
189+ row = []string {t .ID , runLabel , "—" , "—" , "—" , "done (no metrics)" }
172190 }
173- lines = append (lines , line )
191+ taskRows = append (taskRows , row )
174192 if result != nil {
175193 taskResults = append (taskResults , * result )
176194 }
177195 }
178196 }
179197 mu .Lock ()
180- defer mu .Unlock ()
181- for _ , l := range lines {
182- fmt .Print (l )
183- }
198+ completed += len (taskRows )
199+ p .Update (completed , t .ID )
200+ runRows = append (runRows , taskRows ... )
184201 results = append (results , taskResults ... )
202+ mu .Unlock ()
185203 return nil
186204 })
187205 }
188206
189207 if err := g .Wait (); err != nil {
208+ p .Stop ()
190209 return err
191210 }
211+ p .Stop ()
212+
213+ p .PrintTable ([]string {"Task" , "Scenario" , "Time" , "Cost" , "Tokens (in+cr/out)" , "Status" }, runRows )
192214
193215 // Judge (fresh context so a canceled run phase doesn't block judging)
194216 if ! flagSkipJudge {
195- fmt . Println ( " \n Judging results..." )
217+ p . Info ( "Judging results..." )
196218 judgeCtx , judgeCancel := signal .NotifyContext (context .Background (), os .Interrupt )
197219 defer judgeCancel ()
220+
198221 var judgeMu sync.Mutex
222+ var judgeRows [][]string
223+ judgeCompleted := 0
224+ judgeTotal := len (tasks ) * len (scenarios ) * totalRuns
225+
226+ p .Start ("Judging" , judgeTotal )
227+
199228 judgeG , judgeCtx := errgroup .WithContext (judgeCtx )
200229 judgeG .SetLimit (flagParallel )
201230
202231 for _ , t := range tasks {
203232 judgeG .Go (func () error {
204- var lines []string
233+ var taskRows [] []string
205234 for _ , s := range scenarios {
206235 for run := 1 ; run <= totalRuns ; run ++ {
207236 slug := runner .Slug (t .ID , s , run , totalRuns )
208237 result , err := judgeTask (judgeCtx , benchDir , runCfg , t , s , slug )
209- runLabel := fmt . Sprintf ( "%-10s" , s )
238+ runLabel := string ( s )
210239 if totalRuns > 1 {
211- runLabel = fmt .Sprintf ("%-10s run%d" , s , run )
240+ runLabel = fmt .Sprintf ("%s run%d" , s , run )
212241 }
213- var line string
242+ var row [] string
214243 if err != nil {
215- line = fmt . Sprintf ( " %-20s %s error: %v \n " , t .ID , runLabel , err )
244+ row = [] string { t .ID , runLabel , "ERROR: " + err . Error ()}
216245 } else if result != nil {
217- line = fmt .Sprintf (" %-20s %s %s\n " , t .ID , runLabel , result .Rating )
246+ row = []string {t .ID , runLabel , string (result .Rating )}
247+ } else {
248+ row = []string {t .ID , runLabel , "—" }
218249 }
219- lines = append (lines , line )
250+ taskRows = append (taskRows , row )
220251 }
221252 }
222253 judgeMu .Lock ()
223- defer judgeMu . Unlock ( )
224- for _ , l := range lines {
225- fmt . Print ( l )
226- }
254+ judgeCompleted += len ( taskRows )
255+ p . Update ( judgeCompleted , t . ID )
256+ judgeRows = append ( judgeRows , taskRows ... )
257+ judgeMu . Unlock ()
227258 return nil
228259 })
229260 }
261+
230262 if err := judgeG .Wait (); err != nil {
231- fmt .Printf (" Judge error: %v\n " , err )
263+ p .Stop ()
264+ p .Error (fmt .Sprintf ("Judge error: %v" , err ))
265+ } else {
266+ p .Stop ()
232267 }
268+ p .PrintTable ([]string {"Task" , "Scenario" , "Rating" }, judgeRows )
233269 }
234270
235271 // Reports
236- fmt . Println ( " \n Generating reports..." )
272+ p . Info ( "Generating reports..." )
237273 rptCfg := & report.Config {
238274 ResultsDir : resultsDir ,
239275 EmbedModel : flagEmbedModel ,
@@ -251,7 +287,7 @@ func runBenchmarks(cmd *cobra.Command, args []string) error {
251287 return err
252288 }
253289
254- fmt . Printf ( " \n Results: %s \n " , resultsDir )
290+ p . Complete ( "Results: " + resultsDir )
255291 return nil
256292}
257293
0 commit comments