@@ -22,6 +22,7 @@ import {
2222 type ToolContext ,
2323} from '../llm/index.js' ;
2424import type { LLMError } from '../llm/llm.js' ;
25+ import { isSameToolChoice , isSameToolContext } from '../llm/tool_context.js' ;
2526import { log } from '../log.js' ;
2627import type {
2728 EOUMetrics ,
@@ -43,6 +44,7 @@ import { type AgentSession, type TurnDetectionMode } from './agent_session.js';
4344import {
4445 AudioRecognition ,
4546 type EndOfTurnInfo ,
47+ type PreemptiveGenerationInfo ,
4648 type RecognitionHooks ,
4749 type _TurnDetector ,
4850} from './audio_recognition.js' ;
@@ -71,6 +73,16 @@ import { SpeechHandle } from './speech_handle.js';
7173// equivalent to Python's contextvars
7274const speechHandleStorage = new AsyncLocalStorage < SpeechHandle > ( ) ;
7375
/**
 * Bookkeeping for a reply that was started speculatively from a preemptive
 * transcript, before the user's turn has been committed.
 *
 * The record keeps the request parameters that were in effect when the reply
 * was started so they can later be compared against the parameters of the
 * committed turn; the speculative reply is only reused when they still match.
 */
interface PreemptiveGeneration {
  /** Handle returned by `generateReply` for the speculative (not yet scheduled) reply. */
  speechHandle: SpeechHandle;

  /** The preemptive-generation event that triggered this reply. */
  info: PreemptiveGenerationInfo;
  /** User message built from `info.newTranscript` and passed to `generateReply`. */
  userMessage: ChatMessage;

  /** Copy of the chat context the reply was generated from. */
  chatCtx: ChatContext;
  /** Shallow copy of the tool context in effect when the reply was started. */
  tools: ToolContext;
  /** Tool choice in effect when the reply was started (`null` when unset). */
  toolChoice: ToolChoice | null;

  /** `Date.now()` timestamp taken when this record was created. */
  createdAt: number;
}
85+
7486export class AgentActivity implements RecognitionHooks {
7587 private static readonly REPLY_TASK_CANCEL_TIMEOUT = 5000 ;
7688 private started = false ;
@@ -87,6 +99,7 @@ export class AgentActivity implements RecognitionHooks {
8799 private audioStream = new DeferredReadableStream < AudioFrame > ( ) ;
88100 // default to null as None, which maps to the default provider tool choice value
89101 private toolChoice : ToolChoice | null = null ;
102+ private _preemptiveGeneration ?: PreemptiveGeneration ;
90103
91104 agent : Agent ;
92105 agentSession : AgentSession ;
@@ -589,8 +602,12 @@ export class AgentActivity implements RecognitionHooks {
589602 this . agentSession . _updateUserState ( 'speaking' ) ;
590603 }
591604
/**
 * Recognition hook for the VAD end-of-speech event.
 *
 * Switches the user state to 'listening' and passes along the time at which
 * speech ended, computed as the current time minus the event's silence
 * duration.
 *
 * @param ev - VAD event for the end of speech. Its `silenceDuration` is
 *   subtracted from `Date.now()`, so it is presumably expressed in
 *   milliseconds — TODO confirm against the VAD implementation.
 */
onEndOfSpeech(ev: VADEvent): void {
  const now = Date.now();
  // Back-date the end of speech by the silence already observed by the VAD;
  // fall back to "now" when no event is supplied.
  const speechEndTime = ev ? now - ev.silenceDuration : now;
  this.agentSession._updateUserState('listening', speechEndTime);
}
595612
596613 onVADInferenceDone ( ev : VADEvent ) : void {
@@ -664,6 +681,55 @@ export class AgentActivity implements RecognitionHooks {
664681 ) ;
665682 }
666683
/**
 * Recognition hook that starts generating a reply from a preemptive
 * transcript, without scheduling it for playout.
 *
 * Nothing happens when preemptive generation is disabled in the session
 * options, the activity is draining, a current speech exists that has not
 * been interrupted, or the configured `llm` is not an `LLM` instance.
 * Otherwise any previous speculative reply is cancelled, a new one is started
 * with `scheduleSpeech: false`, and the request parameters are recorded in
 * `_preemptiveGeneration`.
 *
 * @param info - Preemptive-generation event; `newTranscript` becomes the
 *   content of the user message and is logged with `transcriptConfidence`.
 */
onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
  // Guards are evaluated in the same order as a single short-circuit `||` chain.
  if (!this.agentSession.options.preemptiveGeneration) {
    return;
  }
  if (this.draining) {
    return;
  }
  if (this._currentSpeech !== undefined && !this._currentSpeech.interrupted) {
    return;
  }
  if (!(this.llm instanceof LLM)) {
    return;
  }

  // Only one speculative reply is tracked at a time; drop the previous one.
  this.cancelPreemptiveGeneration();

  const { newTranscript, transcriptConfidence } = info;
  this.logger.info({ newTranscript, transcriptConfidence }, 'starting preemptive generation');

  const userMessage = ChatMessage.create({ role: 'user', content: newTranscript });
  const chatCtx = this.agent.chatCtx.copy();

  // `scheduleSpeech: false` makes generateReply return the handle without
  // queuing it via scheduleSpeech.
  const speechHandle = this.generateReply({ userMessage, chatCtx, scheduleSpeech: false });

  // Store copies of the chat context and tools so the record is a snapshot of
  // the request as it was issued.
  const snapshot: PreemptiveGeneration = {
    speechHandle,
    userMessage,
    info,
    chatCtx: chatCtx.copy(),
    tools: { ...this.tools },
    toolChoice: this.toolChoice,
    createdAt: Date.now(),
  };
  this._preemptiveGeneration = snapshot;
}
725+
/**
 * Cancels the tracked speculative reply, if there is one, and clears the
 * `_preemptiveGeneration` record. Does nothing when no record is set.
 */
private cancelPreemptiveGeneration(): void {
  const pending = this._preemptiveGeneration;
  if (pending === undefined) {
    return;
  }

  pending.speechHandle._cancel();
  this._preemptiveGeneration = undefined;
}
732+
667733 private createSpeechTask ( options : {
668734 task : Task < void > ;
669735 ownedSpeechHandle ?: SpeechHandle ;
@@ -694,6 +760,7 @@ export class AgentActivity implements RecognitionHooks {
694760
695761 async onEndOfTurn ( info : EndOfTurnInfo ) : Promise < boolean > {
696762 if ( this . draining ) {
763+ this . cancelPreemptiveGeneration ( ) ;
697764 this . logger . warn ( { user_input : info . newTranscript } , 'skipping user input, task is draining' ) ;
698765 // copied from python:
699766 // TODO(shubhra): should we "forward" this new turn to the next agent/activity?
@@ -710,6 +777,7 @@ export class AgentActivity implements RecognitionHooks {
710777 info . newTranscript . split ( ' ' ) . length < this . agentSession . options . minInterruptionWords
711778 ) {
712779 // avoid interruption if the new_transcript is too short
780+ this . cancelPreemptiveGeneration ( ) ;
713781 this . logger . info ( 'skipping user input, new_transcript is too short' ) ;
714782 return false ;
715783 }
@@ -775,13 +843,15 @@ export class AgentActivity implements RecognitionHooks {
775843 instructions ?: string ;
776844 toolChoice ?: ToolChoice | null ;
777845 allowInterruptions ?: boolean ;
846+ scheduleSpeech ?: boolean ;
778847 } ) : SpeechHandle {
779848 const {
780849 userMessage,
781850 chatCtx,
782851 instructions : defaultInstructions ,
783852 toolChoice : defaultToolChoice ,
784853 allowInterruptions : defaultAllowInterruptions ,
854+ scheduleSpeech = true ,
785855 } = options ;
786856
787857 let instructions = defaultInstructions ;
@@ -871,7 +941,9 @@ export class AgentActivity implements RecognitionHooks {
871941 task . finally ( ( ) => this . onPipelineReplyDone ( ) ) ;
872942 }
873943
874- this . scheduleSpeech ( handle , SpeechHandle . SPEECH_PRIORITY_NORMAL ) ;
944+ if ( scheduleSpeech ) {
945+ this . scheduleSpeech ( handle , SpeechHandle . SPEECH_PRIORITY_NORMAL ) ;
946+ }
875947 return handle ;
876948 }
877949
@@ -977,16 +1049,48 @@ export class AgentActivity implements RecognitionHooks {
9771049 return ;
9781050 }
9791051
980- // Ensure the new message is passed to generateReply
981- // This preserves the original message id, making it easier for users to track responses
982- const speechHandle = this . generateReply ( { userMessage, chatCtx } ) ;
1052+ let speechHandle : SpeechHandle | undefined ;
1053+ if ( this . _preemptiveGeneration !== undefined ) {
1054+ const preemptive = this . _preemptiveGeneration ;
1055+ // make sure the onUserTurnCompleted didn't change some request parameters
1056+ // otherwise invalidate the preemptive generation
1057+ if (
1058+ preemptive . info . newTranscript === userMessage ?. textContent &&
1059+ preemptive . chatCtx . isEquivalent ( chatCtx ) &&
1060+ isSameToolContext ( preemptive . tools , this . tools ) &&
1061+ isSameToolChoice ( preemptive . toolChoice , this . toolChoice )
1062+ ) {
1063+ speechHandle = preemptive . speechHandle ;
1064+ this . scheduleSpeech ( speechHandle , SpeechHandle . SPEECH_PRIORITY_NORMAL ) ;
1065+ this . logger . debug (
1066+ {
1067+ preemptiveLeadTime : Date . now ( ) - preemptive . createdAt ,
1068+ } ,
1069+ 'using preemptive generation' ,
1070+ ) ;
1071+ } else {
1072+ this . logger . warn (
1073+ 'preemptive generation enabled but chat context or tools have changed after `onUserTurnCompleted`' ,
1074+ ) ;
1075+ preemptive . speechHandle . _cancel ( ) ;
1076+ }
1077+
1078+ this . _preemptiveGeneration = undefined ;
1079+ }
1080+
1081+ if ( speechHandle === undefined ) {
1082+ // Ensure the new message is passed to generateReply
1083+ // This preserves the original message id, making it easier for users to track responses
1084+ speechHandle = this . generateReply ( { userMessage, chatCtx } ) ;
1085+ }
9831086
9841087 const eouMetrics : EOUMetrics = {
9851088 type : 'eou_metrics' ,
9861089 timestamp : Date . now ( ) ,
9871090 endOfUtteranceDelayMs : info . endOfUtteranceDelay ,
9881091 transcriptionDelayMs : info . transcriptionDelay ,
9891092 onUserTurnCompletedDelayMs : callbackDuration ,
1093+ lastSpeakingTimeMs : info . stoppedSpeakingAt ?? 0 ,
9901094 speechId : speechHandle . id ,
9911095 } ;
9921096
@@ -1139,10 +1243,9 @@ export class AgentActivity implements RecognitionHooks {
11391243
11401244 chatCtx = chatCtx . copy ( ) ;
11411245
1246+ // Insert new message into temporary chat context for LLM inference
11421247 if ( newMessage ) {
11431248 chatCtx . insert ( newMessage ) ;
1144- this . agent . _chatCtx . insert ( newMessage ) ;
1145- this . agentSession . _conversationItemAdded ( newMessage ) ;
11461249 }
11471250
11481251 if ( instructions ) {
@@ -1157,7 +1260,6 @@ export class AgentActivity implements RecognitionHooks {
11571260 }
11581261 }
11591262
1160- this . agentSession . _updateAgentState ( 'thinking' ) ;
11611263 const tasks : Array < Task < void > > = [ ] ;
11621264 const [ llmTask , llmGenData ] = performLLMInference (
11631265 // preserve `this` context in llmNode
@@ -1185,6 +1287,12 @@ export class AgentActivity implements RecognitionHooks {
11851287
11861288 await speechHandle . waitIfNotInterrupted ( [ speechHandle . _waitForScheduled ( ) ] ) ;
11871289
1290+ // Add new message to actual chat context if the speech is scheduled
1291+ if ( newMessage && speechHandle . scheduled ) {
1292+ this . agent . _chatCtx . insert ( newMessage ) ;
1293+ this . agentSession . _conversationItemAdded ( newMessage ) ;
1294+ }
1295+
11881296 if ( speechHandle . interrupted ) {
11891297 replyAbortController . abort ( ) ;
11901298 await cancelAndWait ( tasks , AgentActivity . REPLY_TASK_CANCEL_TIMEOUT ) ;
@@ -1917,6 +2025,7 @@ export class AgentActivity implements RecognitionHooks {
19172025 try {
19182026 if ( this . _draining ) return ;
19192027
2028+ this . cancelPreemptiveGeneration ( ) ;
19202029 this . createSpeechTask ( {
19212030 task : Task . from ( ( ) => this . agent . onExit ( ) ) ,
19222031 name : 'AgentActivity_onExit' ,
@@ -1937,6 +2046,7 @@ export class AgentActivity implements RecognitionHooks {
19372046 this . logger . warn ( 'task closing without draining' ) ;
19382047 }
19392048
2049+ this . cancelPreemptiveGeneration ( ) ;
19402050 // Unregister event handlers to prevent duplicate metrics
19412051 if ( this . llm instanceof LLM ) {
19422052 this . llm . off ( 'metrics_collected' , this . onMetricsCollected ) ;
0 commit comments