fix: agent message flickering (#12)

hugodutka · web-flow · commit 6ac5d166bdd2 · 2025-05-27T14:04:30.000+02:00
* fix: agent message flickering
diff --git a/lib/screentracker/conversation.go b/lib/screentracker/conversation.go
@@ -117,7 +117,17 @@ func (c *Conversation) StartSnapshotLoop(ctx context.Context) {
 			case <-ctx.Done():
 				return
 			case <-time.After(c.cfg.SnapshotInterval):
-				c.AddSnapshot(c.cfg.AgentIO.ReadScreen())
+				// It's important that we hold the lock while reading the screen.
+				// There's a race condition that occurs without it:
+				// 1. The screen is read.
+				// 2. Independently, SendMessage is called and takes the lock.
+				// 3. AddSnapshot is called and waits on the lock.
+				// 4. SendMessage modifies the terminal state, then releases the lock.
+				// 5. AddSnapshot adds a snapshot from the now-stale screen.
+				c.lock.Lock()
+				screen := c.cfg.AgentIO.ReadScreen()
+				c.addSnapshotInner(screen)
+				c.lock.Unlock()
 			}
 		}
 	}()
@@ -191,32 +201,21 @@ func (c *Conversation) updateLastAgentMessage(screen string, timestamp time.Time
 	c.messages[len(c.messages)-1].Id = len(c.messages) - 1
 }
 
-// This is a temporary hack to work around a bug in Claude Code 0.2.70.
-// https://github.com/anthropics/claude-code/issues/803
-// 0.2.71 should not need it anymore. We will remove it a couple of days
-// after the new version is released.
-func removeDuplicateClaude_0_2_70_Output(screen string) string {
-	// this hack will only work if the terminal emulator is exactly 80 characters wide
-	// this is hard-coded right now in the termexec package
-	idx := strings.LastIndex(screen, "╭────────────────────────────────────────────╮                                  \n│ ✻ Welcome to Claude Code research preview! │")
-	if idx == -1 {
-		return screen
+// addSnapshotInner buffers a snapshot of screen and passes it to updateLastAgentMessage. The caller must hold c.lock.
+func (c *Conversation) addSnapshotInner(screen string) {
+	snapshot := screenSnapshot{
+		timestamp: c.cfg.GetTime(),
+		screen:    screen,
 	}
-	return screen[idx:]
+	c.snapshotBuffer.Add(snapshot)
+	c.updateLastAgentMessage(screen, snapshot.timestamp)
 }
 
 func (c *Conversation) AddSnapshot(screen string) {
 	c.lock.Lock()
 	defer c.lock.Unlock()
 
-	screen = removeDuplicateClaude_0_2_70_Output(screen)
-
-	snapshot := screenSnapshot{
-		timestamp: c.cfg.GetTime(),
-		screen:    screen,
-	}
-	c.snapshotBuffer.Add(snapshot)
-	c.updateLastAgentMessage(screen, snapshot.timestamp)
+	c.addSnapshotInner(screen)
 }
 
 type MessagePart interface {
diff --git a/lib/termexec/termexec.go b/lib/termexec/termexec.go
@@ -7,17 +7,21 @@ import (
 	"log/slog"
 	"os"
 	"os/exec"
+	"sync"
 	"syscall"
 	"time"
 
 	"github.com/ActiveState/termtest/xpty"
 	"github.com/coder/agentapi/lib/logctx"
+	"github.com/coder/agentapi/lib/util"
 	"golang.org/x/xerrors"
 )
 
 type Process struct {
-	xp      *xpty.Xpty
-	execCmd *exec.Cmd
+	xp               *xpty.Xpty
+	execCmd          *exec.Cmd
+	screenUpdateLock sync.RWMutex
+	lastScreenUpdate time.Time
 }
 
 type StartProcessConfig struct {
@@ -42,11 +46,38 @@ func StartProcess(ctx context.Context, args StartProcessConfig) (*Process, error
 		return nil, err
 	}
 
+	process := &Process{xp: xp, execCmd: execCmd}
+
 	go func() {
+		// HACK: Working around xpty concurrency limitations
+		//
+		// Problem:
+		// 1. We need to track when the terminal screen was last updated (for ReadScreen)
+		// 2. xpty only updates terminal state through xp.ReadRune()
+		// 3. xp.ReadRune() has a bug - it panics when SetReadDeadline is used
+		// 4. Without deadlines, ReadRune blocks until the process outputs data
+		//
+		// Why this matters:
+		// If we wrapped ReadRune + lastScreenUpdate in a mutex, this goroutine would
+		// hold the lock while waiting for process output. Since ReadRune blocks indefinitely,
+		// ReadScreen callers would be locked out until new output arrives. Even worse,
+		// after output arrives, this goroutine could immediately reacquire the lock
+		// for the next ReadRune call, potentially starving ReadScreen callers indefinitely.
+		//
+		// Solution:
+		// Instead of using xp.ReadRune(), we directly use its internal components:
+		// - pp.ReadRune() - handles the blocking read from the process
+		// - xp.Term.WriteRune() - updates the terminal state
+		//
+		// This lets us apply the mutex only around the terminal update and timestamp,
+		// so the blocking read happens outside the lock and ReadScreen callers are not starved.
+		//
+		// Warning: This depends on xpty internals and may break if xpty changes.
+		// A proper fix would require forking xpty or getting upstream changes.
+		pp := util.GetUnexportedField(xp, "pp").(*xpty.PassthroughPipe)
 		for {
-			// calling ReadRune updates the terminal state. without it,
-			// xp.State will always return an empty string
-			if _, _, err := xp.ReadRune(); err != nil {
+			r, _, err := pp.ReadRune()
+			if err != nil {
 				if err != io.EOF {
 					logger.Error("Error reading from pseudo terminal", "error", err)
 				}
@@ -55,18 +86,42 @@ func StartProcess(ctx context.Context, args StartProcessConfig) (*Process, error
 				// unresponsive.
 				return
 			}
+			process.screenUpdateLock.Lock()
+			// writing to the terminal updates its state. without it,
+			// xp.State will always return an empty string
+			xp.Term.WriteRune(r)
+			process.lastScreenUpdate = time.Now()
+			process.screenUpdateLock.Unlock()
 		}
 	}()
 
-	return &Process{xp: xp, execCmd: execCmd}, nil
+	return process, nil
 }
 
 func (p *Process) Signal(sig os.Signal) error {
 	return p.execCmd.Process.Signal(sig)
 }
 
 // ReadScreen returns the contents of the terminal window.
+// It returns as soon as the terminal has been stable for 16 ms,
+// or about 48 ms after it is called, whichever comes first.
+//
+// This logic acts as a kind of vsync. Agents regularly redraw
+// parts of the screen. If we naively snapshotted the screen,
+// we'd often capture it while it's being updated. This would
+// result in a malformed agent message being returned to the
+// user.
 func (p *Process) ReadScreen() string {
+	for range 3 {
+		p.screenUpdateLock.RLock()
+		if time.Since(p.lastScreenUpdate) >= 16*time.Millisecond {
+			state := p.xp.State.String()
+			p.screenUpdateLock.RUnlock()
+			return state
+		}
+		p.screenUpdateLock.RUnlock()
+		time.Sleep(16 * time.Millisecond)
+	}
 	return p.xp.State.String()
 }
 
diff --git a/lib/util/unsafe.go b/lib/util/unsafe.go
@@ -0,0 +1,12 @@
+package util
+
+import (
+	"reflect"
+	"unsafe"
+)
+
+// GetUnexportedField returns the value of obj's unexported field named fieldName. Based on https://stackoverflow.com/a/60598827
+func GetUnexportedField[T any](obj *T, fieldName string) any {
+	field := reflect.ValueOf(obj).Elem().FieldByName(fieldName)
+	return reflect.NewAt(field.Type(), unsafe.Pointer(field.UnsafeAddr())).Elem().Interface()
+}