Skip to content

Commit 14d5117

Browse files
committed
feat(ext): add streaming-tts-openai extension pack with adaptive sentence chunking
1 parent 55c2363 commit 14d5117

File tree

12 files changed

+1521
-0
lines changed

12 files changed

+1521
-0
lines changed
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
---
2+
name: streaming-tts-openai
3+
description: Streaming text-to-speech via OpenAI Audio Speech API with adaptive sentence chunking
4+
category: voice
5+
---
6+
7+
# OpenAI Streaming TTS
8+
9+
Low-latency streaming text-to-speech using OpenAI's TTS API. Buffers incoming LLM tokens into
10+
natural sentence chunks before making API requests, enabling audio playback to begin within
11+
the first sentence rather than waiting for full LLM output.
12+
13+
## Setup
14+
15+
Set `OPENAI_API_KEY` in your environment or agent secrets store.
16+
17+
## Features
18+
19+
- Adaptive sentence chunking: emits audio after each `.`, `?`, `!`, or `;` boundary
20+
- Fallback flush timer (default 2000 ms) for fragments without punctuation
21+
- Concurrent fetch pipelining: starts fetching the next sentence while the current one plays
22+
- AbortController-based cancellation for all in-flight requests
23+
- Supports all OpenAI TTS voices: alloy, echo, fable, onyx, nova, shimmer
24+
- Configurable model (tts-1, tts-1-hd) and output format (opus, mp3, aac, flac, wav, pcm)
25+
26+
## Configuration
27+
28+
In `agent.config.json`:
29+
30+
```json
31+
{
32+
"voice": {
33+
"tts": "openai"
34+
}
35+
}
36+
```
37+
38+
Provider-specific options via `providerOptions`:
39+
40+
```json
41+
{
42+
"voice": {
43+
"tts": "openai",
44+
"providerOptions": {
45+
"model": "tts-1",
46+
"voice": "nova",
47+
"format": "opus",
48+
"maxBufferMs": 2000
49+
}
50+
}
51+
}
52+
```
53+
54+
## Events
55+
56+
| Event | Payload | Description |
57+
|----------------------|-------------------------|----------------------------------------------------|
58+
| `utterance_start` | `{ text: string }` | Sentence chunk dispatched for synthesis |
59+
| `audio_chunk` | `EncodedAudioChunk` | Synthesized audio buffer ready for playback |
60+
| `utterance_complete` | `{ text, durationMs }` | Synthesis complete for a sentence chunk |
61+
| `cancelled` | `{ remaining: string }` | Session was cancelled; remaining text not rendered |
62+
| `error` | `Error` | Synthesis request failed |
63+
| `close` | — | Session fully terminated |
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{
2+
"name": "@framers/agentos-ext-streaming-tts-openai",
3+
"version": "0.1.0",
4+
"description": "Streaming TTS via OpenAI Audio Speech API with adaptive sentence chunking",
5+
"kind": "streaming-tts-provider",
6+
"extensionId": "streaming-tts-openai",
7+
"entryPoint": "./dist/index.js"
8+
}
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
{
2+
"name": "@framers/agentos-ext-streaming-tts-openai",
3+
"version": "0.1.0",
4+
"description": "Streaming TTS via OpenAI Audio Speech API with adaptive sentence chunking for AgentOS voice pipeline",
5+
"type": "module",
6+
"main": "./dist/index.js",
7+
"types": "./dist/index.d.ts",
8+
"exports": { ".": { "import": "./dist/index.js", "types": "./dist/index.d.ts" } },
9+
"files": ["dist", "src", "SKILL.md", "manifest.json"],
10+
"scripts": { "build": "tsc -p tsconfig.json", "test": "vitest run" },
11+
"peerDependencies": { "@framers/agentos": "^0.1.0" },
12+
"dependencies": {},
13+
"devDependencies": { "@framers/agentos": "workspace:*", "typescript": "^5.5.0", "vitest": "^1.6.0" },
14+
"license": "MIT",
15+
"author": "Frame.dev",
16+
"repository": { "type": "git", "url": "https://github.com/framersai/agentos-extensions.git", "directory": "registry/curated/voice/streaming-tts-openai" },
17+
"publishConfig": { "access": "public" }
18+
}
Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
/**
2+
* @file AdaptiveSentenceChunker.ts
3+
* @description Buffers LLM token stream and emits complete sentence chunks.
4+
*
5+
* {@link AdaptiveSentenceChunker} sits between the LLM token stream and the
6+
* TTS synthesis pipeline. Rather than synthesising each tiny token fragment
7+
* individually, it accumulates tokens and emits a `'sentence'` event whenever
8+
* a sentence boundary is detected (`.`, `?`, `!`, `;` followed by whitespace
9+
* or end-of-input). A fallback flush timer ensures audio is never blocked
10+
* indefinitely on fragments that lack terminal punctuation.
11+
*
12+
* @module streaming-tts-openai/AdaptiveSentenceChunker
13+
*/
14+
15+
import { EventEmitter } from 'node:events';
16+
17+
// ---------------------------------------------------------------------------
18+
// Regex — compiled once at module load
19+
// ---------------------------------------------------------------------------
20+
21+
/**
22+
* Matches the first complete sentence within a string.
23+
*
24+
* Capture group 1: the sentence (including its terminal punctuation mark).
25+
* Capture group 2: remaining text after the inter-sentence whitespace.
26+
*
27+
* The `s` flag enables `.` to match newlines so multi-line inputs work.
28+
*/
29+
const SENTENCE_BOUNDARY = /^(.*?[.?!;])\s(.*)/s;
30+
31+
// ---------------------------------------------------------------------------
32+
// Class
33+
// ---------------------------------------------------------------------------
34+
35+
/**
36+
* Token accumulator that splits LLM output into TTS-friendly sentence chunks.
37+
*
38+
* ### Events
39+
*
40+
* | Event | Payload | Description |
41+
* |--------------|----------|------------------------------------------------|
42+
* | `'sentence'` | `string` | A complete sentence ready for TTS synthesis |
43+
*
44+
* ### Usage
45+
* ```ts
46+
* const chunker = new AdaptiveSentenceChunker(2000);
47+
* chunker.on('sentence', (text) => tts.synthesise(text));
48+
*
49+
* llm.on('token', (tok) => chunker.pushTokens(tok));
50+
* llm.on('end', () => chunker.flush());
51+
* ```
52+
*/
53+
export class AdaptiveSentenceChunker extends EventEmitter {
54+
/** Accumulated text waiting for a sentence boundary. */
55+
private buffer: string = '';
56+
57+
/** Handle for the fallback flush timer; `null` when inactive. */
58+
private flushTimer: NodeJS.Timeout | null = null;
59+
60+
/**
61+
* @param maxBufferMs - Maximum time in milliseconds to hold buffered text
62+
* before forcing a word-boundary flush. Defaults to 2 000 ms.
63+
*/
64+
constructor(private readonly maxBufferMs: number = 2000) {
65+
super();
66+
}
67+
68+
// -------------------------------------------------------------------------
69+
// Public API
70+
// -------------------------------------------------------------------------
71+
72+
/**
73+
* Append one or more LLM output tokens to the internal buffer and check for
74+
* sentence boundaries.
75+
*
76+
* If a boundary is found (`[.?!;]` followed by whitespace), the text up to
77+
* and including the punctuation is emitted as a `'sentence'` event and the
78+
* remainder stays in the buffer. The method recurses to catch multiple
79+
* boundaries in a single push (e.g. when a large chunk arrives at once).
80+
*
81+
* The fallback flush timer is reset on every call so that the 2 s window
82+
* always starts from the most recent token activity.
83+
*
84+
* @param text - Token fragment(s) to append. May be an empty string (used
85+
* internally to trigger boundary re-checks without appending new text).
86+
*/
87+
pushTokens(text: string): void {
88+
this.buffer += text;
89+
this.resetFlushTimer();
90+
91+
// Check for sentence boundaries: [.?!;] followed by whitespace
92+
const match = this.buffer.match(SENTENCE_BOUNDARY);
93+
if (match) {
94+
const sentence = match[1]!;
95+
this.buffer = match[2]!;
96+
this.emit('sentence', sentence);
97+
98+
// Recurse to handle multiple consecutive sentences in the buffer.
99+
if (this.buffer.length > 0) {
100+
this.pushTokens('');
101+
}
102+
}
103+
}
104+
105+
/**
106+
* Flush any remaining buffered text immediately, without waiting for the
107+
* fallback timer.
108+
*
109+
* Call this when the LLM stream has ended to ensure the final fragment is
110+
* synthesised even if it lacks terminal punctuation.
111+
*
112+
* Emits a `'sentence'` event with the trimmed buffer contents if non-empty.
113+
* Cancels the fallback timer.
114+
*/
115+
flush(): void {
116+
if (this.flushTimer) {
117+
clearTimeout(this.flushTimer);
118+
this.flushTimer = null;
119+
}
120+
121+
const remaining = this.buffer.trim();
122+
if (remaining.length > 0) {
123+
this.buffer = '';
124+
this.emit('sentence', remaining);
125+
}
126+
}
127+
128+
/**
129+
* Immediately cancel the chunker: cancel the fallback timer, clear the
130+
* buffer, and return whatever text was pending.
131+
*
132+
* No `'sentence'` event is emitted. The caller receives the raw remaining
133+
* text so it can report it as unsynthesised content in a `'cancelled'` event.
134+
*
135+
* @returns The text that was in the buffer at the time of cancellation.
136+
*/
137+
cancel(): string {
138+
if (this.flushTimer) {
139+
clearTimeout(this.flushTimer);
140+
this.flushTimer = null;
141+
}
142+
143+
const remaining = this.buffer;
144+
this.buffer = '';
145+
return remaining;
146+
}
147+
148+
// -------------------------------------------------------------------------
149+
// Private helpers
150+
// -------------------------------------------------------------------------
151+
152+
/**
153+
* Reset the fallback flush timer.
154+
*
155+
* If text remains in the buffer after {@link maxBufferMs} milliseconds of
156+
* inactivity, the timer breaks the accumulated text at the last word
157+
* boundary (space) and emits the portion before that boundary as a sentence.
158+
* If there is no word boundary, the entire buffer is emitted verbatim.
159+
*
160+
* This prevents TTS from stalling indefinitely on bullet points, code
161+
* snippets, or other text that lacks standard sentence-ending punctuation.
162+
*/
163+
private resetFlushTimer(): void {
164+
if (this.flushTimer) {
165+
clearTimeout(this.flushTimer);
166+
}
167+
168+
this.flushTimer = setTimeout(() => {
169+
this.flushTimer = null;
170+
171+
if (this.buffer.length === 0) return;
172+
173+
// Prefer a clean word boundary over splitting mid-token.
174+
const lastSpace = this.buffer.lastIndexOf(' ');
175+
if (lastSpace > 0) {
176+
const chunk = this.buffer.slice(0, lastSpace);
177+
this.buffer = this.buffer.slice(lastSpace + 1);
178+
this.emit('sentence', chunk);
179+
} else {
180+
// No word boundary found — emit everything.
181+
const chunk = this.buffer;
182+
this.buffer = '';
183+
this.emit('sentence', chunk);
184+
}
185+
}, this.maxBufferMs);
186+
}
187+
}
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
/**
2+
* @file OpenAIStreamingTTS.ts
3+
* @description {@link IStreamingTTS}-compatible factory for OpenAI TTS sessions.
4+
*
5+
* {@link OpenAIStreamingTTS} is a thin factory that creates
6+
* {@link OpenAITTSSession} instances on demand. It holds no mutable state
7+
* other than an active-session reference count used to implement
8+
* {@link isStreaming}.
9+
*
10+
* @module streaming-tts-openai/OpenAIStreamingTTS
11+
*/
12+
13+
import { OpenAITTSSession } from './OpenAITTSSession.js';
14+
import type { OpenAIStreamingTTSConfig, StreamingTTSConfig } from './types.js';
15+
16+
// ---------------------------------------------------------------------------
17+
// Main class
18+
// ---------------------------------------------------------------------------
19+
20+
/**
21+
* Factory for OpenAI-backed streaming TTS sessions.
22+
*
23+
* Instantiate once per agent and reuse across voice turns. Each call to
24+
* {@link startSession} creates an independent {@link OpenAITTSSession} that
25+
* owns its own HTTP connection lifecycle.
26+
*
27+
* @example
28+
* ```ts
29+
* const tts = new OpenAIStreamingTTS({ apiKey: process.env.OPENAI_API_KEY! });
30+
* const session = await tts.startSession();
31+
*
32+
* session.on('audio_chunk', (chunk) => audioPlayer.enqueue(chunk.audio));
33+
*
34+
* llm.on('token', (tok) => session.pushTokens(tok));
35+
* llm.on('end', () => session.flush());
36+
* ```
37+
*/
38+
export class OpenAIStreamingTTS {
39+
/**
40+
* Stable provider identifier used by the voice pipeline to select between
41+
* registered TTS implementations.
42+
*/
43+
readonly providerId = 'openai-streaming-tts';
44+
45+
/** `true` while at least one session is open and not yet closed. */
46+
get isStreaming(): boolean {
47+
return this._activeSessions > 0;
48+
}
49+
50+
/** Count of sessions opened but not yet closed or cancelled. */
51+
private _activeSessions = 0;
52+
53+
/**
54+
* @param config - OpenAI provider configuration. `apiKey` is required;
55+
* all other fields have sensible defaults.
56+
*/
57+
constructor(private readonly config: OpenAIStreamingTTSConfig) {}
58+
59+
/**
60+
* Open a new streaming TTS session.
61+
*
62+
* Provider-specific options can be forwarded via `config.providerOptions`
63+
* under the following keys:
64+
* - `model` — TTS model name (default `'tts-1'`).
65+
* - `voice` — Voice preset (default `'nova'`).
66+
* - `format` — Output format (default `'opus'`).
67+
* - `maxBufferMs` — Flush timer duration ms (default `2000`).
68+
*
69+
* @param config - Generic session config merged with provider defaults.
70+
* @returns A ready-to-use {@link OpenAITTSSession}.
71+
*/
72+
async startSession(config?: StreamingTTSConfig): Promise<OpenAITTSSession> {
73+
const provOpts = (config?.providerOptions ?? {}) as Partial<OpenAIStreamingTTSConfig>;
74+
75+
// Session-level overrides take precedence over factory-level config.
76+
const sessionProviderConfig: OpenAIStreamingTTSConfig = {
77+
...this.config,
78+
model: provOpts.model ?? this.config.model,
79+
voice: provOpts.voice ?? this.config.voice,
80+
format: provOpts.format ?? this.config.format,
81+
maxBufferMs: provOpts.maxBufferMs ?? this.config.maxBufferMs,
82+
};
83+
84+
const session = new OpenAITTSSession(sessionProviderConfig, config ?? {});
85+
86+
this._activeSessions++;
87+
88+
// Decrement the counter when the session terminates for any reason.
89+
const onDone = (): void => {
90+
this._activeSessions = Math.max(0, this._activeSessions - 1);
91+
};
92+
session.once('close', onDone);
93+
session.once('cancelled', onDone);
94+
95+
return session;
96+
}
97+
}

0 commit comments

Comments
 (0)