Skip to content

Commit 14d5117

Browse files
committed
feat(ext): add streaming-tts-openai extension pack with adaptive sentence chunking
1 parent 55c2363 commit 14d5117

File tree

12 files changed

+1521
-0
lines changed

12 files changed

+1521
-0
lines changed
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
---
2+
name: streaming-tts-openai
3+
description: Streaming text-to-speech via OpenAI Audio Speech API with adaptive sentence chunking
4+
category: voice
5+
---
6+
7+
# OpenAI Streaming TTS
8+
9+
Low-latency streaming text-to-speech using OpenAI's TTS API. Buffers incoming LLM tokens into
10+
natural sentence chunks before making API requests, enabling audio playback to begin within
11+
the first sentence rather than waiting for full LLM output.
12+
13+
## Setup
14+
15+
Set `OPENAI_API_KEY` in your environment or agent secrets store.
16+
17+
## Features
18+
19+
- Adaptive sentence chunking: emits audio after each `.`, `?`, `!`, or `;` boundary
20+
- Fallback flush timer (default 2000 ms) for fragments without punctuation
21+
- Concurrent fetch pipelining: starts fetching the next sentence while the current one plays
22+
- AbortController-based cancellation for all in-flight requests
23+
- Supports all OpenAI TTS voices: alloy, echo, fable, onyx, nova, shimmer
24+
- Configurable model (tts-1, tts-1-hd) and output format (opus, mp3, aac, flac, wav, pcm)
25+
26+
## Configuration
27+
28+
In `agent.config.json`:
29+
30+
```json
31+
{
32+
"voice": {
33+
"tts": "openai"
34+
}
35+
}
36+
```
37+
38+
Provider-specific options via `providerOptions`:
39+
40+
```json
41+
{
42+
"voice": {
43+
"tts": "openai",
44+
"providerOptions": {
45+
"model": "tts-1",
46+
"voice": "nova",
47+
"format": "opus",
48+
"maxBufferMs": 2000
49+
}
50+
}
51+
}
52+
```
53+
54+
## Events
55+
56+
| Event | Payload | Description |
57+
|----------------------|-------------------------|----------------------------------------------------|
58+
| `utterance_start` | `{ text: string }` | Sentence chunk dispatched for synthesis |
59+
| `audio_chunk` | `EncodedAudioChunk` | Synthesized audio buffer ready for playback |
60+
| `utterance_complete` | `{ text, durationMs }` | Synthesis complete for a sentence chunk |
61+
| `cancelled` | `{ remaining: string }` | Session was cancelled; remaining text not rendered |
62+
| `error` | `Error` | Synthesis request failed |
63+
| `close` | — | Session fully terminated |
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{
2+
"name": "@framers/agentos-ext-streaming-tts-openai",
3+
"version": "0.1.0",
4+
"description": "Streaming TTS via OpenAI Audio Speech API with adaptive sentence chunking",
5+
"kind": "streaming-tts-provider",
6+
"extensionId": "streaming-tts-openai",
7+
"entryPoint": "./dist/index.js"
8+
}
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
{
2+
"name": "@framers/agentos-ext-streaming-tts-openai",
3+
"version": "0.1.0",
4+
"description": "Streaming TTS via OpenAI Audio Speech API with adaptive sentence chunking for AgentOS voice pipeline",
5+
"type": "module",
6+
"main": "./dist/index.js",
7+
"types": "./dist/index.d.ts",
8+
"exports": { ".": { "import": "./dist/index.js", "types": "./dist/index.d.ts" } },
9+
"files": ["dist", "src", "SKILL.md", "manifest.json"],
10+
"scripts": { "build": "tsc -p tsconfig.json", "test": "vitest run" },
11+
"peerDependencies": { "@framers/agentos": "^0.1.0" },
12+
"dependencies": {},
13+
"devDependencies": { "@framers/agentos": "workspace:*", "typescript": "^5.5.0", "vitest": "^1.6.0" },
14+
"license": "MIT",
15+
"author": "Frame.dev",
16+
"repository": { "type": "git", "url": "https://github.com/framersai/agentos-extensions.git", "directory": "registry/curated/voice/streaming-tts-openai" },
17+
"publishConfig": { "access": "public" }
18+
}
Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
/**
2+
* @file AdaptiveSentenceChunker.ts
3+
* @description Buffers LLM token stream and emits complete sentence chunks.
4+
*
5+
* {@link AdaptiveSentenceChunker} sits between the LLM token stream and the
6+
* TTS synthesis pipeline. Rather than synthesising each tiny token fragment
7+
* individually, it accumulates tokens and emits a `'sentence'` event whenever
8+
* a sentence boundary is detected (`.`, `?`, `!`, `;` followed by whitespace
9+
* or end-of-input). A fallback flush timer ensures audio is never blocked
10+
* indefinitely on fragments that lack terminal punctuation.
11+
*
12+
* @module streaming-tts-openai/AdaptiveSentenceChunker
13+
*/
14+
15+
import { EventEmitter } from 'node:events';
16+
17+
// ---------------------------------------------------------------------------
18+
// Regex — compiled once at module load
19+
// ---------------------------------------------------------------------------
20+
21+
/**
22+
* Matches the first complete sentence within a string.
23+
*
24+
* Capture group 1: the sentence (including its terminal punctuation mark).
25+
* Capture group 2: remaining text after the inter-sentence whitespace.
26+
*
27+
* The `s` flag enables `.` to match newlines so multi-line inputs work.
28+
*/
29+
const SENTENCE_BOUNDARY = /^(.*?[.?!;])\s(.*)/s;
30+
31+
// ---------------------------------------------------------------------------
32+
// Class
33+
// ---------------------------------------------------------------------------
34+
35+
/**
36+
* Token accumulator that splits LLM output into TTS-friendly sentence chunks.
37+
*
38+
* ### Events
39+
*
40+
* | Event | Payload | Description |
41+
* |--------------|----------|------------------------------------------------|
42+
* | `'sentence'` | `string` | A complete sentence ready for TTS synthesis |
43+
*
44+
* ### Usage
45+
* ```ts
46+
* const chunker = new AdaptiveSentenceChunker(2000);
47+
* chunker.on('sentence', (text) => tts.synthesise(text));
48+
*
49+
* llm.on('token', (tok) => chunker.pushTokens(tok));
50+
* llm.on('end', () => chunker.flush());
51+
* ```
52+
*/
53+
export class AdaptiveSentenceChunker extends EventEmitter {
54+
/** Accumulated text waiting for a sentence boundary. */
55+
private buffer: string = '';
56+
57+
/** Handle for the fallback flush timer; `null` when inactive. */
58+
private flushTimer: NodeJS.Timeout | null = null;
59+
60+
/**
61+
* @param maxBufferMs - Maximum time in milliseconds to hold buffered text
62+
* before forcing a word-boundary flush. Defaults to 2 000 ms.
63+
*/
64+
constructor(private readonly maxBufferMs: number = 2000) {
65+
super();
66+
}
67+
68+
// -------------------------------------------------------------------------
69+
// Public API
70+
// -------------------------------------------------------------------------
71+
72+
/**
73+
* Append one or more LLM output tokens to the internal buffer and check for
74+
* sentence boundaries.
75+
*
76+
* If a boundary is found (`[.?!;]` followed by whitespace), the text up to
77+
* and including the punctuation is emitted as a `'sentence'` event and the
78+
* remainder stays in the buffer. The method recurses to catch multiple
79+
* boundaries in a single push (e.g. when a large chunk arrives at once).
80+
*
81+
* The fallback flush timer is reset on every call so that the 2 s window
82+
* always starts from the most recent token activity.
83+
*
84+
* @param text - Token fragment(s) to append. May be an empty string (used
85+
* internally to trigger boundary re-checks without appending new text).
86+
*/
87+
pushTokens(text: string): void {
88+
this.buffer += text;
89+
this.resetFlushTimer();
90+
91+
// Check for sentence boundaries: [.?!;] followed by whitespace
92+
const match = this.buffer.match(SENTENCE_BOUNDARY);
93+
if (match) {
94+
const sentence = match[1]!;
95+
this.buffer = match[2]!;
96+
this.emit('sentence', sentence);
97+
98+
// Recurse to handle multiple consecutive sentences in the buffer.
99+
if (this.buffer.length > 0) {
100+
this.pushTokens('');
101+
}
102+
}
103+
}
104+
105+
/**
106+
* Flush any remaining buffered text immediately, without waiting for the
107+
* fallback timer.
108+
*
109+
* Call this when the LLM stream has ended to ensure the final fragment is
110+
* synthesised even if it lacks terminal punctuation.
111+
*
112+
* Emits a `'sentence'` event with the trimmed buffer contents if non-empty.
113+
* Cancels the fallback timer.
114+
*/
115+
flush(): void {
116+
if (this.flushTimer) {
117+
clearTimeout(this.flushTimer);
118+
this.flushTimer = null;
119+
}
120+
121+
const remaining = this.buffer.trim();
122+
if (remaining.length > 0) {
123+
this.buffer = '';
124+
this.emit('sentence', remaining);
125+
}
126+
}
127+
128+
/**
129+
* Immediately cancel the chunker: cancel the fallback timer, clear the
130+
* buffer, and return whatever text was pending.
131+
*
132+
* No `'sentence'` event is emitted. The caller receives the raw remaining
133+
* text so it can report it as unsynthesised content in a `'cancelled'` event.
134+
*
135+
* @returns The text that was in the buffer at the time of cancellation.
136+
*/
137+
cancel(): string {
138+
if (this.flushTimer) {
139+
clearTimeout(this.flushTimer);
140+
this.flushTimer = null;
141+
}
142+
143+
const remaining = this.buffer;
144+
this.buffer = '';
145+
return remaining;
146+
}
147+
148+
// -------------------------------------------------------------------------
149+
// Private helpers
150+
// -------------------------------------------------------------------------
151+
152+
/**
153+
* Reset the fallback flush timer.
154+
*
155+
* If text remains in the buffer after {@link maxBufferMs} milliseconds of
156+
* inactivity, the timer breaks the accumulated text at the last word
157+
* boundary (space) and emits the portion before that boundary as a sentence.
158+
* If there is no word boundary, the entire buffer is emitted verbatim.
159+
*
160+
* This prevents TTS from stalling indefinitely on bullet points, code
161+
* snippets, or other text that lacks standard sentence-ending punctuation.
162+
*/
163+
private resetFlushTimer(): void {
164+
if (this.flushTimer) {
165+
clearTimeout(this.flushTimer);
166+
}
167+
168+
this.flushTimer = setTimeout(() => {
169+
this.flushTimer = null;
170+
171+
if (this.buffer.length === 0) return;
172+
173+
// Prefer a clean word boundary over splitting mid-token.
174+
const lastSpace = this.buffer.lastIndexOf(' ');
175+
if (lastSpace > 0) {
176+
const chunk = this.buffer.slice(0, lastSpace);
177+
this.buffer = this.buffer.slice(lastSpace + 1);
178+
this.emit('sentence', chunk);
179+
} else {
180+
// No word boundary found — emit everything.
181+
const chunk = this.buffer;
182+
this.buffer = '';
183+
this.emit('sentence', chunk);
184+
}
185+
}, this.maxBufferMs);
186+
}
187+
}
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
/**
2+
* @file OpenAIStreamingTTS.ts
3+
* @description {@link IStreamingTTS}-compatible factory for OpenAI TTS sessions.
4+
*
5+
* {@link OpenAIStreamingTTS} is a thin factory that creates
6+
* {@link OpenAITTSSession} instances on demand. It holds no mutable state
7+
* other than an active-session reference count used to implement
8+
* {@link isStreaming}.
9+
*
10+
* @module streaming-tts-openai/OpenAIStreamingTTS
11+
*/
12+
13+
import { OpenAITTSSession } from './OpenAITTSSession.js';
14+
import type { OpenAIStreamingTTSConfig, StreamingTTSConfig } from './types.js';
15+
16+
// ---------------------------------------------------------------------------
17+
// Main class
18+
// ---------------------------------------------------------------------------
19+
20+
/**
21+
* Factory for OpenAI-backed streaming TTS sessions.
22+
*
23+
* Instantiate once per agent and reuse across voice turns. Each call to
24+
* {@link startSession} creates an independent {@link OpenAITTSSession} that
25+
* owns its own HTTP connection lifecycle.
26+
*
27+
* @example
28+
* ```ts
29+
* const tts = new OpenAIStreamingTTS({ apiKey: process.env.OPENAI_API_KEY! });
30+
* const session = await tts.startSession();
31+
*
32+
* session.on('audio_chunk', (chunk) => audioPlayer.enqueue(chunk.audio));
33+
*
34+
* llm.on('token', (tok) => session.pushTokens(tok));
35+
* llm.on('end', () => session.flush());
36+
* ```
37+
*/
38+
export class OpenAIStreamingTTS {
39+
/**
40+
* Stable provider identifier used by the voice pipeline to select between
41+
* registered TTS implementations.
42+
*/
43+
readonly providerId = 'openai-streaming-tts';
44+
45+
/** `true` while at least one session is open and not yet closed. */
46+
get isStreaming(): boolean {
47+
return this._activeSessions > 0;
48+
}
49+
50+
/** Count of sessions opened but not yet closed or cancelled. */
51+
private _activeSessions = 0;
52+
53+
/**
54+
* @param config - OpenAI provider configuration. `apiKey` is required;
55+
* all other fields have sensible defaults.
56+
*/
57+
constructor(private readonly config: OpenAIStreamingTTSConfig) {}
58+
59+
/**
60+
* Open a new streaming TTS session.
61+
*
62+
* Provider-specific options can be forwarded via `config.providerOptions`
63+
* under the following keys:
64+
* - `model` — TTS model name (default `'tts-1'`).
65+
* - `voice` — Voice preset (default `'nova'`).
66+
* - `format` — Output format (default `'opus'`).
67+
* - `maxBufferMs` — Flush timer duration ms (default `2000`).
68+
*
69+
* @param config - Generic session config merged with provider defaults.
70+
* @returns A ready-to-use {@link OpenAITTSSession}.
71+
*/
72+
async startSession(config?: StreamingTTSConfig): Promise<OpenAITTSSession> {
73+
const provOpts = (config?.providerOptions ?? {}) as Partial<OpenAIStreamingTTSConfig>;
74+
75+
// Session-level overrides take precedence over factory-level config.
76+
const sessionProviderConfig: OpenAIStreamingTTSConfig = {
77+
...this.config,
78+
model: provOpts.model ?? this.config.model,
79+
voice: provOpts.voice ?? this.config.voice,
80+
format: provOpts.format ?? this.config.format,
81+
maxBufferMs: provOpts.maxBufferMs ?? this.config.maxBufferMs,
82+
};
83+
84+
const session = new OpenAITTSSession(sessionProviderConfig, config ?? {});
85+
86+
this._activeSessions++;
87+
88+
// Decrement the counter when the session terminates for any reason.
89+
const onDone = (): void => {
90+
this._activeSessions = Math.max(0, this._activeSessions - 1);
91+
};
92+
session.once('close', onDone);
93+
session.once('cancelled', onDone);
94+
95+
return session;
96+
}
97+
}

0 commit comments

Comments
 (0)