feat(op-acceptor): add timing-based bin-packing for CI test splitting

scharissis · scharissis · commit 656450a11c29 · 2026-02-17T17:23:36.000+01:00
Add --split-timing-file and --split-timing-output flags to enable
balanced CI splitting using a greedy LPT (Longest Processing Time
first) algorithm instead of naive round-robin.

When a timing hints JSON file is provided, ApplySplitFilter distributes
work items across nodes to approximately minimize the makespan (the
duration of the slowest node); LPT is a heuristic, not an exact optimum.
Without timing data, the existing round-robin fallback is preserved.

After each run, timing data can be written to a file for caching and
reuse on subsequent CI runs. The report-from-events mode also supports
extracting timing data from merged test events.
diff --git a/op-acceptor/config.go b/op-acceptor/config.go
@@ -40,6 +40,8 @@ type Config struct {
 	ReportFromEvents     string                 // Path to raw events file for report-only mode (empty = normal mode)
 	SplitTotal           int                    // Total split nodes for CI parallelism (0 = no splitting)
 	SplitIndex           int                    // This node's index (0-based) for CI parallelism
+	SplitTimingFile      string                 // Path to JSON timing hints for balanced CI splitting
+	SplitTimingOutput    string                 // Path to write updated timing data after test execution
 	Log                  log.Logger
 	ExcludeGates         []string // List of gate IDs whose tests should be excluded
 }
@@ -166,6 +168,8 @@ func NewConfig(ctx *cli.Context, log log.Logger, testDir string, validatorConfig
 		ReportFromEvents:     ctx.String(flags.ReportFromEvents.Name),
 		SplitTotal:           splitTotal,
 		SplitIndex:           splitIndex,
+		SplitTimingFile:      ctx.String(flags.SplitTimingFile.Name),
+		SplitTimingOutput:    ctx.String(flags.SplitTimingOutput.Name),
 		LogDir:               logDir,
 		Log:                  log,
 		ExcludeGates:         excludeGates,
diff --git a/op-acceptor/flags/flags.go b/op-acceptor/flags/flags.go
@@ -203,6 +203,18 @@ var (
 		EnvVars: opservice.PrefixEnvVar(EnvVarPrefix, "SPLIT_INDEX"),
 		Usage:   "Index of this node (0-based) for CI parallelism.",
 	}
+	SplitTimingFile = &cli.StringFlag{
+		Name:    "split-timing-file",
+		Value:   "",
+		EnvVars: opservice.PrefixEnvVar(EnvVarPrefix, "SPLIT_TIMING_FILE"),
+		Usage:   "Path to JSON file with package timing hints for balanced CI splitting.",
+	}
+	SplitTimingOutput = &cli.StringFlag{
+		Name:    "split-timing-output",
+		Value:   "",
+		EnvVars: opservice.PrefixEnvVar(EnvVarPrefix, "SPLIT_TIMING_OUTPUT"),
+		Usage:   "Path to write updated timing data after test execution (for caching).",
+	}
 )
 
 var requiredFlags = []cli.Flag{
@@ -233,6 +245,8 @@ var optionalFlags = []cli.Flag{
 	ReportFromEvents,
 	SplitTotal,
 	SplitIndex,
+	SplitTimingFile,
+	SplitTimingOutput,
 }
 var Flags []cli.Flag
 
diff --git a/op-acceptor/nat.go b/op-acceptor/nat.go
@@ -199,6 +199,7 @@ func New(ctx context.Context, config *Config, version string, shutdownCallback f
 		ProgressInterval:   config.ProgressInterval,
 		SplitTotal:         config.SplitTotal,
 		SplitIndex:         config.SplitIndex,
+		SplitTimingFile:    config.SplitTimingFile,
 	})
 	if err != nil {
 		return nil, fmt.Errorf("failed to create test runner: %w", err)
@@ -550,6 +551,18 @@ func (n *nat) runTests(ctx context.Context) error {
 		}
 	}
 
+	// Write timing output if configured (for CI caching)
+	if n.config.SplitTimingOutput != "" && !n.config.FlakeShake && n.result != nil {
+		timingData := n.extractTimingData(n.result)
+		if err := runner.WriteTimingFile(n.config.SplitTimingOutput, timingData); err != nil {
+			n.config.Log.Error("Failed to write timing output", "path", n.config.SplitTimingOutput, "error", err)
+		} else {
+			n.config.Log.Info("Wrote timing output for CI caching",
+				"path", n.config.SplitTimingOutput,
+				"packages", len(timingData))
+		}
+	}
+
 	// We should have the same runID from the test run result (skip for flake-shake mode)
 	if !n.config.FlakeShake && n.result.RunID != runID {
 		n.config.Log.Warn("RunID from result doesn't match expected runID",
@@ -1055,7 +1068,12 @@ func (n *nat) dryRun(ctx context.Context) error {
 				})
 			}
 		}
-		filtered := runner.ApplySplitFilter(allWork, n.config.SplitTotal, n.config.SplitIndex)
+		timings, err := runner.LoadTimingFile(n.config.SplitTimingFile)
+		if err != nil {
+			n.config.Log.Warn("Failed to load timing file, falling back to round-robin",
+				"path", n.config.SplitTimingFile, "error", err)
+		}
+		filtered := runner.ApplySplitFilter(allWork, n.config.SplitTotal, n.config.SplitIndex, timings)
 
 		// Rebuild gateValidators from the filtered work items
 		gateValidators = make(map[string][]types.ValidatorMetadata)
@@ -1065,7 +1083,8 @@ func (n *nat) dryRun(ctx context.Context) error {
 		n.config.Log.Info("DRY RUN: Applied CI split filter",
 			"splitTotal", n.config.SplitTotal,
 			"splitIndex", n.config.SplitIndex,
-			"workItems", len(filtered))
+			"workItems", len(filtered),
+			"timingBased", len(timings) > 0)
 	}
 
 	t := table.NewWriter()
@@ -1204,9 +1223,52 @@ func (n *nat) reportFromEvents() error {
 	logDir, _ := fileLogger.GetDirectoryForRunID(runID)
 	n.config.Log.Info("Consolidated report generated", "path", logDir)
 
+	// Write timing output if configured (for CI caching)
+	if n.config.SplitTimingOutput != "" {
+		timingData := extractTimingDataFromParsedResults(results)
+		if err := runner.WriteTimingFile(n.config.SplitTimingOutput, timingData); err != nil {
+			n.config.Log.Error("Failed to write timing output from events", "error", err)
+		} else {
+			n.config.Log.Info("Wrote timing output from events",
+				"path", n.config.SplitTimingOutput,
+				"packages", len(timingData))
+		}
+	}
+
 	return nil
 }
 
+// extractTimingData builds a TimingKey → duration_seconds map covering every gate-level and suite-level test in result.
+// Keys are built with runner.TimingKey(gate, package, funcName); the receiver n is not read. Intended for caching and reuse by timing-based CI splitting.
+func (n *nat) extractTimingData(result *runner.RunnerResult) map[string]float64 {
+	timings := make(map[string]float64)
+	for gateName, gate := range result.Gates {
+		for _, test := range gate.Tests { // tests attached directly to the gate
+			key := runner.TimingKey(gateName, test.Metadata.Package, test.Metadata.FuncName)
+			timings[key] = test.Duration.Seconds()
+		}
+		for _, suite := range gate.Suites { // suite tests are keyed by the enclosing gate; the suite name is not part of the key
+			for _, test := range suite.Tests {
+				key := runner.TimingKey(gateName, test.Metadata.Package, test.Metadata.FuncName)
+				timings[key] = test.Duration.Seconds() // overwrites any earlier entry that produced the same key
+			}
+		}
+	}
+	return timings
+}
+
+// extractTimingDataFromParsedResults builds a TimingKey → duration_seconds map
+// from parsed test results (called from report-from-events mode); later results overwrite earlier ones with the same key.
+func extractTimingDataFromParsedResults(results []*types.TestResult) map[string]float64 {
+	timings := make(map[string]float64)
+	for _, result := range results {
+		// Every result is keyed under the fixed gate ID "gateless". NOTE(review): such keys only match work items whose GateID is also "gateless" — confirm gate-based runs are not expected to consume this output.
+		key := runner.TimingKey("gateless", result.Metadata.Package, result.Metadata.FuncName)
+		timings[key] = result.Duration.Seconds()
+	}
+	return timings
+}
+
 // WaitForShutdown waits for all goroutines to finish
 func (n *nat) WaitForShutdown(ctx context.Context) error {
 	timeout := time.NewTimer(time.Second * 5)
diff --git a/op-acceptor/runner/parallel.go b/op-acceptor/runner/parallel.go
@@ -2,7 +2,9 @@ package runner
 
 import (
 	"context"
+	"encoding/json"
 	"fmt"
+	"os"
 	"sort"
 	"sync"
 	"time"
@@ -289,31 +291,46 @@ func (r *runner) collectTestWork() []TestWork {
 
 	// Apply CI split filtering if configured
 	if r.splitTotal > 0 {
-		workItems = ApplySplitFilter(workItems, r.splitTotal, r.splitIndex)
+		timings, err := LoadTimingFile(r.splitTimingFile)
+		if err != nil {
+			r.log.Warn("Failed to load timing file, falling back to round-robin",
+				"path", r.splitTimingFile, "error", err)
+		}
+		workItems = ApplySplitFilter(workItems, r.splitTotal, r.splitIndex, timings)
 		r.log.Info("Applied CI split filter",
 			"splitTotal", r.splitTotal,
 			"splitIndex", r.splitIndex,
-			"workItems", len(workItems))
+			"workItems", len(workItems),
+			"timingBased", len(timings) > 0)
 	}
 
 	return workItems
 }
 
-// splitKey returns a deterministic key for sorting work items during CI split filtering.
-// Includes GateID so that the same package appearing under different gates (via inheritance)
-// gets a stable, distinct position in the sort order.
+// TimingKey builds the canonical key used for timing-based CI splitting.
+// The format is "gate|package|funcName", so the same package/function under
+// different gates (e.g. via inheritance) gets a distinct key. The parts are
+// joined without escaping, so uniqueness assumes none of them contains "|".
+func TimingKey(gate, pkg, funcName string) string {
+	return gate + "|" + pkg + "|" + funcName
+}
+
+// splitKey returns the TimingKey (gate|package|funcName) of a TestWork item; it serves as both the deterministic sort key and the timing lookup key.
 func splitKey(w TestWork) string {
-	return w.GateID + "|" + w.Validator.Package + "|" + w.Validator.FuncName
+	return TimingKey(w.GateID, w.Validator.Package, w.Validator.FuncName)
 }
 
-// ApplySplitFilter sorts work items deterministically and returns only those assigned
-// to the given split index. Items are distributed round-robin: item i is assigned to
-// node i % total.
-func ApplySplitFilter(items []TestWork, total, index int) []TestWork {
+// ApplySplitFilter distributes work items across split nodes. When timings are
+// provided, it uses a greedy bin-packing (LPT) algorithm for balanced splits.
+// Otherwise it falls back to deterministic round-robin by sorted key.
+func ApplySplitFilter(items []TestWork, total, index int, timings map[string]float64) []TestWork {
+	if len(timings) > 0 {
+		return applySplitByTiming(items, total, index, timings)
+	}
+	// Existing round-robin fallback
 	sort.Slice(items, func(i, j int) bool {
 		return splitKey(items[i]) < splitKey(items[j])
 	})
-
 	var filtered []TestWork
 	for i, item := range items {
 		if i%total == index {
@@ -323,6 +340,100 @@ func ApplySplitFilter(items []TestWork, total, index int) []TestWork {
 	return filtered
 }
 
+// applySplitByTiming assigns items to total nodes using the greedy Longest Processing Time first (LPT)
+// heuristic and returns the items assigned to node index. LPT approximates, but does not guarantee, the minimal
+// makespan (duration of the slowest node). It sorts items in place and panics unless 0 <= index < total.
+func applySplitByTiming(items []TestWork, total, index int, timings map[string]float64) []TestWork {
+	defaultDuration := medianTiming(timings)
+
+	// Duration lookup: items with no entry in timings are assumed to take the median duration.
+	duration := func(w TestWork) float64 {
+		if d, ok := timings[splitKey(w)]; ok {
+			return d
+		}
+		return defaultDuration
+	}
+
+	// Sort by duration descending (heaviest first -- standard LPT),
+	// breaking ties by key so items with distinct keys always end up in the same order.
+	sort.Slice(items, func(i, j int) bool {
+		di, dj := duration(items[i]), duration(items[j])
+		if di != dj {
+			return di > dj
+		}
+		return splitKey(items[i]) < splitKey(items[j])
+	})
+
+	// Greedy assignment: each item goes to the node with the lowest accumulated duration so far.
+	nodeTotals := make([]float64, total)
+	nodeItems := make([][]TestWork, total)
+	for _, item := range items {
+		minNode := 0
+		for n := 1; n < total; n++ {
+			if nodeTotals[n] < nodeTotals[minNode] { // strict < keeps ties on the lowest-numbered node
+				minNode = n
+			}
+		}
+		nodeItems[minNode] = append(nodeItems[minNode], item)
+		nodeTotals[minNode] += duration(item)
+	}
+
+	return nodeItems[index] // nil when no item was assigned to this node
+}
+
+// medianTiming returns the median of the timing values (the mean of the two middle values when the count is even).
+// If the map is empty it returns 60.0, a fixed default for unknown test durations.
+func medianTiming(timings map[string]float64) float64 {
+	if len(timings) == 0 {
+		return 60.0
+	}
+	vals := make([]float64, 0, len(timings))
+	for _, v := range timings {
+		vals = append(vals, v)
+	}
+	sort.Float64s(vals) // map iteration order is unspecified, so sort before picking the middle
+	mid := len(vals) / 2
+	if len(vals)%2 == 0 {
+		return (vals[mid-1] + vals[mid]) / 2 // even count: average the two middle values
+	}
+	return vals[mid]
+}
+
+// LoadTimingFile reads a JSON object mapping TimingKey → duration_seconds from path.
+// Returns a nil map and nil error if path is empty or the file doesn't exist; other read errors and parse errors are returned wrapped.
+func LoadTimingFile(path string) (map[string]float64, error) {
+	if path == "" {
+		return nil, nil
+	}
+	data, err := os.ReadFile(path)
+	if err != nil {
+		if os.IsNotExist(err) { // a missing file is treated as "no timing data", not as an error
+			return nil, nil
+		}
+		return nil, fmt.Errorf("reading timing file: %w", err)
+	}
+	var timings map[string]float64 // stays nil if the file contains JSON null
+	if err := json.Unmarshal(data, &timings); err != nil {
+		return nil, fmt.Errorf("parsing timing file: %w", err)
+	}
+	return timings, nil
+}
+
+// WriteTimingFile writes timings (TimingKey → duration_seconds) to path as two-space-indented JSON with mode 0644; an empty path is a no-op.
+func WriteTimingFile(path string, timings map[string]float64) error {
+	if path == "" {
+		return nil
+	}
+	data, err := json.MarshalIndent(timings, "", "  ") // a nil map is encoded as JSON null
+	if err != nil {
+		return fmt.Errorf("marshalling timing data: %w", err)
+	}
+	if err := os.WriteFile(path, data, 0644); err != nil { // NOTE(review): not an atomic replace — confirm nothing reads path concurrently
+		return fmt.Errorf("writing timing file: %w", err)
+	}
+	return nil
+}
+
 // initializeProgressTracking sets up data structures to concurrently
 // track progress for each gate and suite in the scheduled work items
 func (pe *ParallelExecutor) initializeProgressTracking(workItems []TestWork) {
diff --git a/op-acceptor/runner/parallel_test.go b/op-acceptor/runner/parallel_test.go
diff --git a/op-acceptor/runner/runner.go b/op-acceptor/runner/runner.go