brianyin/ajs-314-preemptive-generation (#798)

toubatbrian · web-flow · commit c54c21e55f65 · 2025-11-04T00:04:24.000-08:00
diff --git a/.changeset/fine-buckets-sink.md b/.changeset/fine-buckets-sink.md
@@ -0,0 +1,5 @@
+---
+'@livekit/agents': patch
+---
+
+Add preemptive generation
diff --git a/agents/src/inference/stt.ts b/agents/src/inference/stt.ts
@@ -2,7 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 import { type AudioFrame } from '@livekit/rtc-node';
-import { type RawData, WebSocket } from 'ws';
+import type { WebSocket } from 'ws';
+import { type RawData } from 'ws';
 import { APIError, APIStatusError } from '../_exceptions.js';
 import { AudioByteStream } from '../audio.js';
 import { log } from '../log.js';
diff --git a/agents/src/inference/tts.ts b/agents/src/inference/tts.ts
@@ -8,11 +8,8 @@ import { AudioByteStream } from '../audio.js';
 import { log } from '../log.js';
 import { createStreamChannel } from '../stream/stream_channel.js';
 import { basic as tokenizeBasic } from '../tokenize/index.js';
-import {
-  SynthesizeStream as BaseSynthesizeStream,
-  TTS as BaseTTS,
-  ChunkedStream,
-} from '../tts/index.js';
+import type { ChunkedStream } from '../tts/index.js';
+import { SynthesizeStream as BaseSynthesizeStream, TTS as BaseTTS } from '../tts/index.js';
 import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
 import { shortuuid } from '../utils.js';
 import {
diff --git a/agents/src/llm/tool_context.ts b/agents/src/llm/tool_context.ts
@@ -187,6 +187,50 @@ export type ToolContext<UserData = UnknownUserData> = {
   [name: string]: FunctionTool<any, UserData, any>;
 };
 
+export function isSameToolContext(ctx1: ToolContext, ctx2: ToolContext): boolean {
+  const toolNames = new Set(Object.keys(ctx1));
+  const toolNames2 = new Set(Object.keys(ctx2));
+
+  if (toolNames.size !== toolNames2.size) {
+    return false;
+  }
+
+  for (const name of toolNames) {
+    if (!toolNames2.has(name)) {
+      return false;
+    }
+
+    const tool1 = ctx1[name];
+    const tool2 = ctx2[name];
+
+    if (!tool1 || !tool2) {
+      return false;
+    }
+
+    if (tool1.description !== tool2.description) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+export function isSameToolChoice(choice1: ToolChoice | null, choice2: ToolChoice | null): boolean {
+  if (choice1 === choice2) {
+    return true;
+  }
+  if (choice1 === null || choice2 === null) {
+    return false;
+  }
+  if (typeof choice1 === 'string' && typeof choice2 === 'string') {
+    return choice1 === choice2;
+  }
+  if (typeof choice1 === 'object' && typeof choice2 === 'object') {
+    return choice1.type === choice2.type && choice1.function.name === choice2.function.name;
+  }
+  return false;
+}
+
 /**
  * Create a function tool with inferred parameters from the schema.
  */
diff --git a/agents/src/metrics/base.ts b/agents/src/metrics/base.ts
@@ -91,6 +91,13 @@ export type EOUMetrics = {
    * Time taken to invoke the user's `Agent.onUserTurnCompleted` callback.
    */
   onUserTurnCompletedDelayMs: number;
+  /**
+   * Time (in milliseconds) at which the user stopped speaking; reported as 0 when unavailable.
+   */
+  lastSpeakingTimeMs: number;
+  /**
+   * The ID of the speech handle.
+   */
   speechId?: string;
 };
 
diff --git a/agents/src/stt/stt.ts b/agents/src/stt/stt.ts
@@ -38,6 +38,12 @@ export enum SpeechEventType {
   END_OF_SPEECH = 3,
   /** Usage event, emitted periodically to indicate usage metrics. */
   RECOGNITION_USAGE = 4,
+  /**
+   * Preflight transcript, emitted before the final transcript when the STT has high
+   * confidence but has not fully committed yet. It includes all pre-committed transcripts,
+   * including the final transcript from the previous STT run.
+   */
+  PREFLIGHT_TRANSCRIPT = 5,
 }
 
 /** SpeechData contains metadata about this {@link SpeechEvent}. */
diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts
@@ -22,6 +22,7 @@ import {
   type ToolContext,
 } from '../llm/index.js';
 import type { LLMError } from '../llm/llm.js';
+import { isSameToolChoice, isSameToolContext } from '../llm/tool_context.js';
 import { log } from '../log.js';
 import type {
   EOUMetrics,
@@ -43,6 +44,7 @@ import { type AgentSession, type TurnDetectionMode } from './agent_session.js';
 import {
   AudioRecognition,
   type EndOfTurnInfo,
+  type PreemptiveGenerationInfo,
   type RecognitionHooks,
   type _TurnDetector,
 } from './audio_recognition.js';
@@ -71,6 +73,16 @@ import { SpeechHandle } from './speech_handle.js';
 // equivalent to Python's contextvars
 const speechHandleStorage = new AsyncLocalStorage<SpeechHandle>();
 
+interface PreemptiveGeneration {
+  speechHandle: SpeechHandle;
+  userMessage: ChatMessage;
+  info: PreemptiveGenerationInfo;
+  chatCtx: ChatContext;
+  tools: ToolContext;
+  toolChoice: ToolChoice | null;
+  createdAt: number;
+}
+
 export class AgentActivity implements RecognitionHooks {
   private static readonly REPLY_TASK_CANCEL_TIMEOUT = 5000;
   private started = false;
@@ -87,6 +99,7 @@ export class AgentActivity implements RecognitionHooks {
   private audioStream = new DeferredReadableStream<AudioFrame>();
   // default to null as None, which maps to the default provider tool choice value
   private toolChoice: ToolChoice | null = null;
+  private _preemptiveGeneration?: PreemptiveGeneration;
 
   agent: Agent;
   agentSession: AgentSession;
@@ -589,8 +602,12 @@ export class AgentActivity implements RecognitionHooks {
     this.agentSession._updateUserState('speaking');
   }
 
-  onEndOfSpeech(_ev: VADEvent): void {
-    this.agentSession._updateUserState('listening');
+  onEndOfSpeech(ev: VADEvent): void {
+    let speechEndTime = Date.now();
+    if (ev) {
+      speechEndTime = speechEndTime - ev.silenceDuration;
+    }
+    this.agentSession._updateUserState('listening', speechEndTime);
   }
 
   onVADInferenceDone(ev: VADEvent): void {
@@ -664,6 +681,55 @@ export class AgentActivity implements RecognitionHooks {
     );
   }
 
+  onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
+    if (
+      !this.agentSession.options.preemptiveGeneration ||
+      this.draining ||
+      (this._currentSpeech !== undefined && !this._currentSpeech.interrupted) ||
+      !(this.llm instanceof LLM)
+    ) {
+      return;
+    }
+
+    this.cancelPreemptiveGeneration();
+
+    this.logger.info(
+      {
+        newTranscript: info.newTranscript,
+        transcriptConfidence: info.transcriptConfidence,
+      },
+      'starting preemptive generation',
+    );
+
+    const userMessage = ChatMessage.create({
+      role: 'user',
+      content: info.newTranscript,
+    });
+    const chatCtx = this.agent.chatCtx.copy();
+    const speechHandle = this.generateReply({
+      userMessage,
+      chatCtx,
+      scheduleSpeech: false,
+    });
+
+    this._preemptiveGeneration = {
+      speechHandle,
+      userMessage,
+      info,
+      chatCtx: chatCtx.copy(),
+      tools: { ...this.tools },
+      toolChoice: this.toolChoice,
+      createdAt: Date.now(),
+    };
+  }
+
+  private cancelPreemptiveGeneration(): void {
+    if (this._preemptiveGeneration !== undefined) {
+      this._preemptiveGeneration.speechHandle._cancel();
+      this._preemptiveGeneration = undefined;
+    }
+  }
+
   private createSpeechTask(options: {
     task: Task<void>;
     ownedSpeechHandle?: SpeechHandle;
@@ -694,6 +760,7 @@ export class AgentActivity implements RecognitionHooks {
 
   async onEndOfTurn(info: EndOfTurnInfo): Promise<boolean> {
     if (this.draining) {
+      this.cancelPreemptiveGeneration();
       this.logger.warn({ user_input: info.newTranscript }, 'skipping user input, task is draining');
       // copied from python:
       // TODO(shubhra): should we "forward" this new turn to the next agent/activity?
@@ -710,6 +777,7 @@ export class AgentActivity implements RecognitionHooks {
       info.newTranscript.split(' ').length < this.agentSession.options.minInterruptionWords
     ) {
       // avoid interruption if the new_transcript is too short
+      this.cancelPreemptiveGeneration();
       this.logger.info('skipping user input, new_transcript is too short');
       return false;
     }
@@ -775,13 +843,15 @@ export class AgentActivity implements RecognitionHooks {
     instructions?: string;
     toolChoice?: ToolChoice | null;
     allowInterruptions?: boolean;
+    scheduleSpeech?: boolean;
   }): SpeechHandle {
     const {
       userMessage,
       chatCtx,
       instructions: defaultInstructions,
       toolChoice: defaultToolChoice,
       allowInterruptions: defaultAllowInterruptions,
+      scheduleSpeech = true,
     } = options;
 
     let instructions = defaultInstructions;
@@ -871,7 +941,9 @@ export class AgentActivity implements RecognitionHooks {
       task.finally(() => this.onPipelineReplyDone());
     }
 
-    this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
+    if (scheduleSpeech) {
+      this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
+    }
     return handle;
   }
 
@@ -977,16 +1049,48 @@ export class AgentActivity implements RecognitionHooks {
       return;
     }
 
-    // Ensure the new message is passed to generateReply
-    // This preserves the original message id, making it easier for users to track responses
-    const speechHandle = this.generateReply({ userMessage, chatCtx });
+    let speechHandle: SpeechHandle | undefined;
+    if (this._preemptiveGeneration !== undefined) {
+      const preemptive = this._preemptiveGeneration;
+      // Reuse the preemptive generation only if `onUserTurnCompleted` left the request
+      // parameters (transcript, chat context, tools, tool choice) unchanged; otherwise discard it.
+      if (
+        preemptive.info.newTranscript === userMessage?.textContent &&
+        preemptive.chatCtx.isEquivalent(chatCtx) &&
+        isSameToolContext(preemptive.tools, this.tools) &&
+        isSameToolChoice(preemptive.toolChoice, this.toolChoice)
+      ) {
+        speechHandle = preemptive.speechHandle;
+        this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
+        this.logger.debug(
+          {
+            preemptiveLeadTime: Date.now() - preemptive.createdAt,
+          },
+          'using preemptive generation',
+        );
+      } else {
+        this.logger.warn(
+          'preemptive generation enabled but chat context or tools have changed after `onUserTurnCompleted`',
+        );
+        preemptive.speechHandle._cancel();
+      }
+
+      this._preemptiveGeneration = undefined;
+    }
+
+    if (speechHandle === undefined) {
+      // Ensure the new message is passed to generateReply
+      // This preserves the original message id, making it easier for users to track responses
+      speechHandle = this.generateReply({ userMessage, chatCtx });
+    }
 
     const eouMetrics: EOUMetrics = {
       type: 'eou_metrics',
       timestamp: Date.now(),
       endOfUtteranceDelayMs: info.endOfUtteranceDelay,
       transcriptionDelayMs: info.transcriptionDelay,
       onUserTurnCompletedDelayMs: callbackDuration,
+      lastSpeakingTimeMs: info.stoppedSpeakingAt ?? 0,
       speechId: speechHandle.id,
     };
 
@@ -1139,10 +1243,9 @@ export class AgentActivity implements RecognitionHooks {
 
     chatCtx = chatCtx.copy();
 
+    // Insert new message into temporary chat context for LLM inference
     if (newMessage) {
       chatCtx.insert(newMessage);
-      this.agent._chatCtx.insert(newMessage);
-      this.agentSession._conversationItemAdded(newMessage);
     }
 
     if (instructions) {
@@ -1157,7 +1260,6 @@ export class AgentActivity implements RecognitionHooks {
       }
     }
 
-    this.agentSession._updateAgentState('thinking');
     const tasks: Array<Task<void>> = [];
     const [llmTask, llmGenData] = performLLMInference(
       // preserve  `this` context in llmNode
@@ -1185,6 +1287,12 @@ export class AgentActivity implements RecognitionHooks {
 
     await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
 
+    // Add the new message to the agent's actual chat context only if the speech was scheduled
+    if (newMessage && speechHandle.scheduled) {
+      this.agent._chatCtx.insert(newMessage);
+      this.agentSession._conversationItemAdded(newMessage);
+    }
+
     if (speechHandle.interrupted) {
       replyAbortController.abort();
       await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
@@ -1917,6 +2025,7 @@ export class AgentActivity implements RecognitionHooks {
     try {
       if (this._draining) return;
 
+      this.cancelPreemptiveGeneration();
       this.createSpeechTask({
         task: Task.from(() => this.agent.onExit()),
         name: 'AgentActivity_onExit',
@@ -1937,6 +2046,7 @@ export class AgentActivity implements RecognitionHooks {
         this.logger.warn('task closing without draining');
       }
 
+      this.cancelPreemptiveGeneration();
       // Unregister event handlers to prevent duplicate metrics
       if (this.llm instanceof LLM) {
         this.llm.off('metrics_collected', this.onMetricsCollected);
diff --git a/agents/src/voice/agent_session.ts b/agents/src/voice/agent_session.ts
@@ -57,6 +57,7 @@ export interface VoiceOptions {
   minEndpointingDelay: number;
   maxEndpointingDelay: number;
   maxToolSteps: number;
+  preemptiveGeneration: boolean;
 }
 
 const defaultVoiceOptions: VoiceOptions = {
@@ -67,6 +68,7 @@ const defaultVoiceOptions: VoiceOptions = {
   minEndpointingDelay: 500,
   maxEndpointingDelay: 6000,
   maxToolSteps: 3,
+  preemptiveGeneration: false,
 } as const;
 
 export type TurnDetectionMode = 'stt' | 'vad' | 'realtime_llm' | 'manual' | _TurnDetector;
@@ -421,7 +423,7 @@ export class AgentSession<
   }
 
   /** @internal */
-  _updateUserState(state: UserState) {
+  _updateUserState(state: UserState, _lastSpeakingTime?: number) {
     if (this.userState === state) {
       return;
     }
diff --git a/agents/src/voice/audio_recognition.ts b/agents/src/voice/audio_recognition.ts
diff --git a/agents/src/voice/room_io/_input.ts b/agents/src/voice/room_io/_input.ts
diff --git a/agents/src/worker.ts b/agents/src/worker.ts
diff --git a/examples/src/basic_agent.ts b/examples/src/basic_agent.ts

-Original file line number
+Diff line change
@@ -0,0 +1,5 @@
 +---
 +'@livekit/agents': patch
 +---
++
 +Add preemptive generation