Skip to content

Commit 163607f

Browse files
authored
Merge pull request #14 from UCL-VR/audio-sender
Add AudioSender, fix binary stdout corruption, and fix audio-to-audio pipeline bugs
2 parents 768c90b + c3f4c1c commit 163607f

File tree

7 files changed

+366
-168
lines changed

7 files changed

+366
-168
lines changed

Node/apps/conversational_agent/app.ts

Lines changed: 45 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1-
import { NetworkId } from 'ubiq-server/ubiq';
21
import { ApplicationController } from '../../components/application';
32
import { TextToSpeechService } from '../../services/text_to_speech/service';
43
import { SpeechToTextService } from '../../services/speech_to_text/service';
54
import { TextGenerationService } from '../../services/text_generation/service';
65
import { AudioToAudioService } from '../../services/audio_to_audio/service';
76
import { VoipReceiver } from '../../components/voip_receiver';
7+
import { AudioSender } from '../../components/audio_sender';
88
import {
99
encodePacket,
1010
LengthPrefixedParser,
@@ -20,17 +20,6 @@ import { RTCAudioData } from '@roamhq/wrtc/types/nonstandard';
2020
import { fileURLToPath } from 'url';
2121
import nconf from 'nconf';
2222

23-
/**
24-
* How many milliseconds of model audio to accumulate before flushing to Unity.
25-
* Batching reduces AudioInfo message overhead and gives Unity's audio system
26-
* a larger buffer to work with, preventing playback stutter / latency.
27-
*
28-
* The model produces one 80 ms frame per step (~12.5 fps).
29-
* 240 ms ≈ 3 frames — a good trade-off between latency and smoothness.
30-
* Override with UBIQ_AUDIO_BATCH_MS.
31-
*/
32-
const AUDIO_BATCH_MS = Number(process.env.UBIQ_AUDIO_BATCH_MS) || 240;
33-
3423
export class ConversationalAgent extends ApplicationController {
3524
components: {
3625
voipReceiver?: VoipReceiver;
@@ -39,25 +28,17 @@ export class ConversationalAgent extends ApplicationController {
3928
textToSpeechService?: TextToSpeechService;
4029
audioToAudioService?: AudioToAudioService;
4130
} = {};
31+
32+
/** Shared sender that handles AudioInfo headers + chunked PCM protocol. */
33+
private audioSender!: AudioSender;
4234
targetPeerQueue: string[] = [];
4335

4436
/** Tracks the UUID of the peer that most recently sent audio. */
4537
private lastAudioSenderUuid: string = '';
4638

47-
/** Whether the PersonaPlex handshake has been received. */
48-
private personaplexReady: boolean = false;
49-
5039
/** Parser instance for decoding PersonaPlex stdout framing. */
5140
private stdoutParser: LengthPrefixedParser = new LengthPrefixedParser();
5241

53-
// --- Audio output batching ---
54-
/** Accumulated audio buffers (48 kHz PCM16LE) waiting to be sent to Unity. */
55-
private audioOutputQueue: Buffer[] = [];
56-
/** Total byte length of buffers currently in audioOutputQueue. */
57-
private audioOutputQueueBytes: number = 0;
58-
/** Timer handle for the periodic audio flush. */
59-
private audioFlushTimer: ReturnType<typeof setTimeout> | null = null;
60-
6142
constructor(configFile: string = 'config.json') {
6243
super(configFile);
6344
}
@@ -81,6 +62,9 @@ export class ConversationalAgent extends ApplicationController {
8162
}
8263

8364
registerComponents() {
65+
// Centralised audio sender — includes sampleRate in every AudioInfo header
66+
this.audioSender = new AudioSender(this.scene, 95, 48000);
67+
8468
// A VoipReceiver to receive audio data from peers via WebRTC VOIP
8569
this.components.voipReceiver = new VoipReceiver(this.scene);
8670

@@ -108,12 +92,20 @@ export class ConversationalAgent extends ApplicationController {
10892
/**
10993
* Audio-to-audio pipeline: audio from peers is downsampled to 24 kHz,
11094
* framed with the PersonaPlex binary protocol, and sent to the model.
111-
* Model output (audio + text) is parsed, upsampled to 48 kHz, batched,
112-
* and sent to Unity periodically (every AUDIO_BATCH_MS milliseconds).
95+
* Model output (audio + text) is parsed, upsampled to 48 kHz, and sent
96+
* to Unity immediately as raw PCM chunks.
97+
*
98+
* A single AudioInfo header is sent when the first audio frame arrives
99+
* from the model. Subsequent frames are streamed as raw PCM chunks
100+
* without new headers — this avoids Unity's `dropOnNewSequence`
101+
* clearing the playback queue on every header.
113102
*/
114103
private defineAudioToAudioPipeline() {
115104
const service = this.components.audioToAudioService!;
116105

106+
/** Whether the initial AudioInfo header has been sent for this stream. */
107+
let streamStarted = false;
108+
117109
// ---- Input: receive 48 kHz PCM16 from WebRTC ----
118110
this.components.voipReceiver?.on('audio', (uuid: string, data: RTCAudioData) => {
119111
if (this.roomClient.peers.get(uuid) === undefined) {
@@ -122,13 +114,17 @@ export class ConversationalAgent extends ApplicationController {
122114

123115
// Don't send audio to the model before it's ready — it would pile
124116
// up in the OS pipe buffer and create a stale backlog.
125-
if (!this.personaplexReady) {
117+
if (service.state !== 'ready' && service.state !== 'idle') {
126118
return;
127119
}
128120

129121
this.lastAudioSenderUuid = uuid;
130122

131-
const sampleBuffer = Buffer.from(data.samples.buffer);
123+
const sampleBuffer = Buffer.from(
124+
data.samples.buffer,
125+
data.samples.byteOffset,
126+
data.samples.byteLength,
127+
);
132128
const downsampled = downsample48kTo24k(sampleBuffer);
133129
const packet = encodePacket(KIND_AUDIO, downsampled);
134130

@@ -148,13 +144,24 @@ export class ConversationalAgent extends ApplicationController {
148144
for (const packet of packets) {
149145
switch (packet.kind) {
150146
case KIND_HANDSHAKE:
151-
this.personaplexReady = true;
147+
service.setReady();
152148
this.log('PersonaPlex handshake received — model is ready');
153149
break;
154150

155151
case KIND_AUDIO: {
156152
const upsampled = upsample24kTo48k(packet.payload);
157-
this.enqueueAudioForUnity(upsampled);
153+
154+
// Send one AudioInfo at stream start so Unity knows the
155+
// sample rate and target peer. After that, send only raw
156+
// PCM chunks — no new headers that would clear the queue.
157+
if (!streamStarted) {
158+
const targetPeerObj = this.roomClient.peers.get(this.lastAudioSenderUuid);
159+
const targetPeer = targetPeerObj?.properties.get('ubiq.displayname') ?? '';
160+
this.audioSender.sendHeader({ targetPeer });
161+
streamStarted = true;
162+
}
163+
164+
this.audioSender.sendChunks(upsampled);
158165
break;
159166
}
160167

@@ -178,78 +185,27 @@ export class ConversationalAgent extends ApplicationController {
178185
});
179186

180187
service.on('close', (code: number | null, signal: string | null, identifier: string) => {
181-
this.flushAudioToUnity();
188+
streamStarted = false;
182189
this.flushStream();
183190
this.log(`PersonaPlex process ${identifier} exited (code=${code}, signal=${signal})`, 'warning');
184-
this.personaplexReady = false;
185191
this.stdoutParser.reset();
186192
});
187193
}
188194

189-
// ---- Audio output batching helpers ----
190-
191-
/**
192-
* Queue upsampled audio and schedule a batched send to Unity.
193-
*
194-
* Instead of sending each 80 ms model frame individually (which causes
195-
* Unity to receive a new AudioInfo message 12.5×/sec and potentially
196-
* introduces playback startup overhead per message), we accumulate
197-
* frames and flush them as one larger AudioInfo batch every AUDIO_BATCH_MS.
198-
*/
199-
private enqueueAudioForUnity(upsampled: Buffer): void {
200-
this.audioOutputQueue.push(upsampled);
201-
this.audioOutputQueueBytes += upsampled.length;
202-
203-
// Start the flush timer on the first enqueued frame
204-
if (this.audioFlushTimer === null) {
205-
this.audioFlushTimer = setTimeout(() => this.flushAudioToUnity(), AUDIO_BATCH_MS);
206-
}
207-
}
208-
209-
/**
210-
* Flush all queued audio to Unity as a single AudioInfo + data batch.
211-
*/
212-
private flushAudioToUnity(): void {
213-
if (this.audioFlushTimer !== null) {
214-
clearTimeout(this.audioFlushTimer);
215-
this.audioFlushTimer = null;
216-
}
217-
218-
if (this.audioOutputQueue.length === 0) {
219-
return;
220-
}
221-
222-
const combined = Buffer.concat(this.audioOutputQueue);
223-
this.audioOutputQueue = [];
224-
this.audioOutputQueueBytes = 0;
225-
226-
// Resolve target peer
227-
const targetPeerObj = this.roomClient.peers.get(this.lastAudioSenderUuid);
228-
const targetPeer = targetPeerObj?.properties.get('ubiq.displayname') ?? '';
229-
230-
// Send one AudioInfo for the entire batch
231-
this.scene.send(new NetworkId(95), {
232-
type: 'AudioInfo',
233-
targetPeer: targetPeer,
234-
audioLength: combined.length,
235-
});
236-
237-
let remaining = combined;
238-
while (remaining.length > 0) {
239-
this.scene.send(new NetworkId(95), remaining.subarray(0, 16000));
240-
remaining = remaining.subarray(16000);
241-
}
242-
}
243-
244195
/**
245196
* Traditional 3-stage pipeline: STT → text generation → TTS.
246197
* This is the original pipeline, preserved for backwards compatibility.
247198
*/
248199
private defineTraditionalPipeline() {
249200
// Step 1: When we receive audio data from a peer we send it to the transcription service
250201
this.components.voipReceiver?.on('audio', (uuid: string, data: RTCAudioData) => {
251-
// Convert the Int16Array to a Buffer
252-
const sampleBuffer = Buffer.from(data.samples.buffer);
202+
// Convert the Int16Array to a Buffer (use byteOffset/byteLength
203+
// in case the TypedArray is a view into a larger ArrayBuffer)
204+
const sampleBuffer = Buffer.from(
205+
data.samples.buffer,
206+
data.samples.byteOffset,
207+
data.samples.byteLength,
208+
);
253209

254210
// Send the audio data to the transcription service
255211
if (this.roomClient.peers.get(uuid) !== undefined) {
@@ -296,19 +252,8 @@ export class ConversationalAgent extends ApplicationController {
296252
});
297253

298254
this.components.textToSpeechService?.on('data', (data: Buffer, identifier: string) => {
299-
let response = data;
300255
const targetPeer = this.targetPeerQueue.shift() ?? '';
301-
302-
this.scene.send(new NetworkId(95), {
303-
type: 'AudioInfo',
304-
targetPeer: targetPeer,
305-
audioLength: data.length,
306-
});
307-
308-
while (response.length > 0) {
309-
this.scene.send(new NetworkId(95), response.slice(0, 16000));
310-
response = response.slice(16000);
311-
}
256+
this.audioSender.send(data, { targetPeer });
312257
});
313258
}
314259
}

Node/apps/stream_describer/app.ts

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import { MediaReceiver } from '../../components/media_receiver';
44
import { MessageReader } from '../../components/message_reader';
55
import { VisualQuestionAnsweringService } from '../../services/visual_question_answering/service';
66
import { TextToSpeechService } from '../../services/text_to_speech/service';
7+
import { AudioSender } from '../../components/audio_sender';
78
import path from 'path';
89
import { fileURLToPath } from 'url';
910
import nconf from 'nconf';
@@ -37,6 +38,9 @@ class StreamDescriber extends ApplicationController {
3738
tts?: TextToSpeechService;
3839
} = {};
3940

41+
/** Shared sender for TTS audio — includes sampleRate in AudioInfo headers. */
42+
private audioSender!: AudioSender;
43+
4044
/** Timestamp of the last frame sent per peer, to throttle. */
4145
private lastFrameTime = new Map<string, number>();
4246

@@ -83,6 +87,9 @@ class StreamDescriber extends ApplicationController {
8387
}
8488

8589
registerComponents(): void {
90+
// Centralised audio sender — includes sampleRate in every AudioInfo header
91+
this.audioSender = new AudioSender(this.scene, AUDIO_NETWORK_ID, 48000);
92+
8693
// MediaReceiver to receive video tracks sent by MediaTrackManager
8794
this.components.mediaReceiver = new MediaReceiver(this.scene);
8895

@@ -287,19 +294,7 @@ class StreamDescriber extends ApplicationController {
287294

288295
this.log(`Sending ${combined.length} bytes of TTS audio to Unity`);
289296

290-
// One AudioInfo for the entire speech sequence
291-
this.scene.send(new NetworkId(AUDIO_NETWORK_ID), {
292-
type: 'AudioInfo',
293-
audioLength: combined.length.toString(),
294-
});
295-
296-
// Stream the raw PCM16 data
297-
let offset = 0;
298-
while (offset < combined.length) {
299-
const end = Math.min(offset + 16000, combined.length);
300-
this.scene.send(new NetworkId(AUDIO_NETWORK_ID), combined.subarray(offset, end));
301-
offset = end;
302-
}
297+
this.audioSender.send(combined);
303298
}
304299
}
305300

Node/components/audio_sender.ts

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
import { NetworkId, NetworkScene } from 'ubiq-server/ubiq';
2+
3+
/** Maximum bytes of raw PCM16 data per network message. */
4+
const MAX_CHUNK_BYTES = 16000;
5+
6+
/** Default sample rate when none is specified. */
7+
const DEFAULT_SAMPLE_RATE = 48000;
8+
9+
/**
10+
* Encapsulates the AudioInfo + chunked-PCM16 protocol used by Ubiq-Genie
11+
* to stream audio from the Node server to Unity's InjectableAudioSource.
12+
*
13+
* Using this component instead of inline `scene.send` calls ensures that:
14+
* - The `sampleRate` field is always included in AudioInfo headers so that
15+
* Unity can resample when the device output rate differs.
16+
* - The chunked-send loop and header format are consistent across all apps.
17+
*/
18+
export class AudioSender {
19+
private readonly scene: NetworkScene;
20+
private readonly networkId: NetworkId;
21+
private readonly sampleRate: number;
22+
23+
/**
24+
* @param scene The Ubiq NetworkScene to send messages on.
25+
* @param networkId The network ID that InjectableAudioSource listens on.
26+
* @param sampleRate The sample rate of the PCM16 audio that will be sent
27+
* (Hz). This is included in every AudioInfo header so
28+
* Unity can resample if its output rate differs.
29+
* Defaults to 48 000.
30+
*/
31+
constructor(scene: NetworkScene, networkId: number | NetworkId, sampleRate: number = DEFAULT_SAMPLE_RATE) {
32+
this.scene = scene;
33+
this.networkId = typeof networkId === 'number' ? new NetworkId(networkId) : networkId;
34+
this.sampleRate = sampleRate;
35+
}
36+
37+
/**
38+
* Send a complete audio buffer to Unity using the AudioInfo + chunked-PCM
39+
* protocol. Sends an AudioInfo header followed by the raw data chunks.
40+
*
41+
* Best for **one-shot** audio (e.g. TTS responses) where each call
42+
* represents a discrete utterance. For continuous streaming (e.g.
43+
* audio-to-audio), use `sendHeader()` once and then `sendChunks()` for
44+
* each frame to avoid Unity clearing its playback queue on every header.
45+
*
46+
* @param audio Raw PCM16-LE mono audio bytes at `this.sampleRate`.
47+
* @param options Optional metadata included in the AudioInfo header.
48+
*/
49+
send(audio: Buffer, options?: { targetPeer?: string }): void {
50+
if (audio.length === 0) return;
51+
this.sendHeader({ ...options, audioLength: audio.length });
52+
this.sendChunks(audio);
53+
}
54+
55+
/**
56+
* Send only the AudioInfo header (no audio data).
57+
*
58+
* Use this once at the start of a continuous stream, then call
59+
* `sendChunks()` for each audio frame. This avoids the queue-clearing
60+
* side-effect of `dropOnNewSequence` on the Unity side.
61+
*/
62+
sendHeader(options?: { targetPeer?: string; audioLength?: number }): void {
63+
this.scene.send(this.networkId, {
64+
type: 'AudioInfo',
65+
targetPeer: options?.targetPeer ?? '',
66+
audioLength: (options?.audioLength ?? 0).toString(),
67+
sampleRate: this.sampleRate.toString(),
68+
});
69+
}
70+
71+
/**
72+
* Send raw PCM16 data chunks without an AudioInfo header.
73+
*
74+
* Each chunk is at most MAX_CHUNK_BYTES. Unity's InjectableAudioSource
75+
* treats messages ≥ 200 bytes that are not valid JSON as raw PCM audio,
76+
* so no preceding AudioInfo is needed for the data to be played.
77+
*/
78+
sendChunks(audio: Buffer): void {
79+
if (audio.length === 0) return;
80+
let offset = 0;
81+
while (offset < audio.length) {
82+
const end = Math.min(offset + MAX_CHUNK_BYTES, audio.length);
83+
this.scene.send(this.networkId, audio.subarray(offset, end));
84+
offset = end;
85+
}
86+
}
87+
}

0 commit comments

Comments (0)