Skip to content

Commit 7d273b3

Browse files
authored
fix(op-acceptor): ensure logs are written to disk for timeout cases (#540)
When tests timeout and CI kills the process (SIGKILL, exit code 137), buffered writes in AsyncFile were being lost because the background goroutine never had a chance to flush them. This adds: - AsyncFile.Flush() to block until queued writes are synced to disk - FileLogger.FlushAll() to flush all async writers - FileLogger.LogTestResultSync() for synchronous logging of timeout results - Graceful shutdown flush in nat.Stop() to persist logs before exit This ensures per-test log files exist in logs/testrun-*/failed/ even when tests are terminated due to timeout.
1 parent dc45976 commit 7d273b3

File tree

3 files changed

+56
-1
lines changed

3 files changed

+56
-1
lines changed

op-acceptor/logging/filelogger.go

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,30 @@ func (af *AsyncFile) Close() error {
129129
return af.file.Close()
130130
}
131131

132+
// Flush blocks until all queued writes are written to disk and synced.
133+
// It sends an empty marker to the queue and waits for the queue to drain,
134+
// ensuring all previously queued writes have been processed.
135+
func (af *AsyncFile) Flush() error {
136+
af.mu.Lock()
137+
if af.stopped {
138+
af.mu.Unlock()
139+
return nil
140+
}
141+
af.mu.Unlock()
142+
143+
// Send an empty marker to the queue. Once the queue drains to empty,
144+
// we know all writes (including the marker) have been processed.
145+
af.queue <- []byte{}
146+
147+
// Wait for the queue to drain. This spin-wait is acceptable for flush
148+
// operations which only occur during shutdown or timeout handling.
149+
for len(af.queue) > 0 {
150+
time.Sleep(time.Millisecond)
151+
}
152+
153+
return af.file.Sync()
154+
}
155+
132156
// NewFileLogger creates a new FileLogger with given configuration
133157
func NewFileLogger(baseDir string, runID string, networkName string, gateRuns []string) (*FileLogger, error) {
134158
if runID == "" {
@@ -246,6 +270,20 @@ func (l *FileLogger) closeAllWriters() {
246270
l.asyncWriters = make(map[string]*AsyncFile)
247271
}
248272

273+
// FlushAll flushes all async writers to disk without closing them
274+
func (l *FileLogger) FlushAll() error {
275+
l.mu.Lock()
276+
defer l.mu.Unlock()
277+
278+
var firstErr error
279+
for _, writer := range l.asyncWriters {
280+
if err := writer.Flush(); err != nil && firstErr == nil {
281+
firstErr = err
282+
}
283+
}
284+
return firstErr
285+
}
286+
249287
// GetDirectoryForRunID returns the path for a specific runID
250288
// The runID must be provided, otherwise an error is returned
251289
func (l *FileLogger) GetDirectoryForRunID(runID string) (string, error) {
@@ -277,6 +315,15 @@ func (l *FileLogger) LogTestResult(result *types.TestResult, runID string) error
277315
return nil
278316
}
279317

318+
// LogTestResultSync logs a test result and flushes immediately to disk.
319+
// Use this for timeout cases to ensure logs are written before process termination.
320+
func (l *FileLogger) LogTestResultSync(result *types.TestResult, runID string) error {
321+
if err := l.LogTestResult(result, runID); err != nil {
322+
return err
323+
}
324+
return l.FlushAll()
325+
}
326+
280327
// LogSummary writes a summary of the test run to a file
281328
// The runID must be provided, otherwise an error is returned
282329
func (l *FileLogger) LogSummary(summary string, runID string) error {

op-acceptor/nat.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -784,6 +784,13 @@ func (n *nat) writeRunArtifacts(runID string) error {
784784
func (n *nat) Stop(ctx context.Context) error {
785785
n.config.Log.Info("Stopping op-acceptor")
786786

787+
// Flush logs before shutdown to ensure all buffered writes are persisted
788+
if n.fileLogger != nil {
789+
if err := n.fileLogger.FlushAll(); err != nil {
790+
n.config.Log.Error("Error flushing logs during shutdown", "error", err)
791+
}
792+
}
793+
787794
// Check if we're already stopped
788795
if !n.running.Load() {
789796
n.config.Log.Debug("Service already stopped, nothing to do")

op-acceptor/runner/runner.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -976,8 +976,9 @@ func (r *runner) runSingleTest(ctx context.Context, metadata types.ValidatorMeta
976976
}
977977

978978
// Force logging of timeout result to ensure it's captured
979+
// Use synchronous logging to ensure writes are flushed to disk before potential process termination
979980
if r.fileLogger != nil {
980-
if logErr := r.fileLogger.LogTestResult(parsed, r.runID); logErr != nil {
981+
if logErr := r.fileLogger.LogTestResultSync(parsed, r.runID); logErr != nil {
981982
r.log.Error("Failed to log timeout result", "error", logErr, "test", metadata.FuncName)
982983
}
983984
}

0 commit comments

Comments
 (0)