diff --git a/README.md b/README.md
index f1ea8995..6ccb90c6 100644
--- a/README.md
+++ b/README.md
@@ -66,6 +66,7 @@ Currently, only the following plugins are supported:
 | [@livekit/agents-plugin-openai](https://www.npmjs.com/package/@livekit/agents-plugin-openai) | LLM, TTS, STT |
 | [@livekit/agents-plugin-google](https://www.npmjs.com/package/@livekit/agents-plugin-google) | LLM, TTS |
 | [@livekit/agents-plugin-deepgram](https://www.npmjs.com/package/@livekit/agents-plugin-deepgram) | STT, TTS |
+| [@livekit/agents-plugin-hathora](https://www.npmjs.com/package/@livekit/agents-plugin-hathora) | STT, TTS |
 | [@livekit/agents-plugin-elevenlabs](https://www.npmjs.com/package/@livekit/agents-plugin-elevenlabs) | TTS |
 | [@livekit/agents-plugin-cartesia](https://www.npmjs.com/package/@livekit/agents-plugin-cartesia) | TTS |
 | [@livekit/agents-plugin-neuphonic](https://www.npmjs.com/package/@livekit/agents-plugin-neuphonic) | TTS |
diff --git a/plugins/hathora/README.md b/plugins/hathora/README.md
new file mode 100644
index 00000000..a8c7a994
--- /dev/null
+++ b/plugins/hathora/README.md
@@ -0,0 +1,17 @@
+
+# Hathora plugin for LiveKit Agents
+
+The Agents Framework is designed for building realtime, programmable
+participants that run on servers. Use it to create conversational, multi-modal
+voice agents that can see, hear, and understand.
+
+This package contains the Hathora plugin, which allows for voice synthesis and speech recognition.
+Refer to the [documentation](https://docs.livekit.io/agents/overview/) for
+information on how to use it, or browse the [API
+reference](https://docs.livekit.io/agents-js/modules/plugins_agents_plugin_hathora.html).
+See the [repository](https://github.com/livekit/agents-js) for more information
+about the framework as a whole.
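Since the README stops short of a code sample, here is a minimal sketch of pulling the plugin into a project. It is illustrative only: the model and voice names are placeholders, and the `STT` and `TTS` classes and their options are the ones introduced in `src/stt.ts` and `src/tts.ts` later in this diff.

```typescript
// Importing the package registers the Hathora plugin with the framework
// (see src/index.ts below) and exposes the STT and TTS classes.
import { STT, TTS } from '@livekit/agents-plugin-hathora';

// Placeholder model names; browse https://models.hathora.dev for the actual
// catalog. The API key may be passed via the apiKey option or picked up from
// the HATHORA_API_KEY environment variable.
const hathoraSTT = new STT({ model: 'example-stt-model' });
const hathoraTTS = new TTS({ model: 'example-tts-model', voice: 'example-voice' });
```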
diff --git a/plugins/hathora/api-extractor.json b/plugins/hathora/api-extractor.json
new file mode 100644
index 00000000..1f75e070
--- /dev/null
+++ b/plugins/hathora/api-extractor.json
@@ -0,0 +1,20 @@
+/**
+ * Config file for API Extractor. For more info, please visit: https://api-extractor.com
+ */
+{
+  "$schema": "https://developer.microsoft.com/json-schemas/api-extractor/v7/api-extractor.schema.json",
+
+  /**
+   * Optionally specifies another JSON config file that this file extends from. This provides a way for
+   * standard settings to be shared across multiple projects.
+   *
+   * If the path starts with "./" or "../", the path is resolved relative to the folder of the file that contains
+   * the "extends" field. Otherwise, the first path segment is interpreted as an NPM package name, and will be
+   * resolved using NodeJS require().
+   *
+   * SUPPORTED TOKENS: none
+   * DEFAULT VALUE: ""
+   */
+  "extends": "../../api-extractor-shared.json",
+  "mainEntryPointFilePath": "./dist/index.d.ts"
+}
diff --git a/plugins/hathora/package.json b/plugins/hathora/package.json
new file mode 100644
index 00000000..6e647038
--- /dev/null
+++ b/plugins/hathora/package.json
@@ -0,0 +1,53 @@
+{
+  "name": "@livekit/agents-plugin-hathora",
+  "version": "0.1.0",
+  "description": "Hathora plugin for LiveKit Node Agents",
+  "main": "dist/index.js",
+  "require": "dist/index.cjs",
+  "types": "dist/index.d.ts",
+  "exports": {
+    "import": {
+      "types": "./dist/index.d.ts",
+      "default": "./dist/index.js"
+    },
+    "require": {
+      "types": "./dist/index.d.cts",
+      "default": "./dist/index.cjs"
+    }
+  },
+  "author": "LiveKit",
+  "type": "module",
+  "repository": "git@github.com:livekit/agents-js.git",
+  "license": "Apache-2.0",
+  "files": [
+    "dist",
+    "src",
+    "README.md"
+  ],
+  "scripts": {
+    "build": "tsup --onSuccess \"pnpm build:types\"",
+    "build:types": "tsc --declaration --emitDeclarationOnly && node ../../scripts/copyDeclarationOutput.js",
+    "clean": "rm -rf dist",
+    "clean:build": "pnpm clean && pnpm build",
+    "lint": "eslint -f unix \"src/**/*.{ts,js}\"",
+    "api:check": "api-extractor run --typescript-compiler-folder ../../node_modules/typescript",
+    "api:update": "api-extractor run --local --typescript-compiler-folder ../../node_modules/typescript --verbose"
+  },
+  "devDependencies": {
+    "@livekit/agents": "workspace:*",
+    "@livekit/agents-plugin-openai": "workspace:*",
+    "@livekit/agents-plugins-test": "workspace:*",
+    "@livekit/rtc-node": "^0.13.12",
+    "@microsoft/api-extractor": "^7.35.0",
+    "@types/ws": "^8.5.10",
+    "tsup": "^8.3.5",
+    "typescript": "^5.0.0"
+  },
+  "dependencies": {
+    "ws": "^8.16.0"
+  },
+  "peerDependencies": {
+    "@livekit/agents": "workspace:*",
+    "@livekit/rtc-node": "^0.13.12"
+  }
+}
diff --git a/plugins/hathora/src/index.ts b/plugins/hathora/src/index.ts
new file mode 100644
index 00000000..97a63664
--- /dev/null
+++ b/plugins/hathora/src/index.ts
@@ -0,0 +1,20 @@
+// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import { Plugin } from '@livekit/agents';
+
+export * from './tts.js';
+export * from './stt.js';
+export * from './utils.js';
+
+class HathoraPlugin extends Plugin {
+  constructor() {
+    super({
+      title: 'hathora',
+      version: '0.1.0',
+      package: '@livekit/agents-plugin-hathora',
+    });
+  }
+}
+
+Plugin.registerPlugin(new HathoraPlugin());
diff --git a/plugins/hathora/src/stt.ts b/plugins/hathora/src/stt.ts
new file mode 100644
index 00000000..4ec060d8
--- /dev/null
+++ b/plugins/hathora/src/stt.ts
@@ -0,0 +1,145 @@
+import { type AudioBuffer, stt } from '@livekit/agents';
+import { combineAudioFrames, type AudioFrame } from '@livekit/rtc-node';
+import type { ConfigOption } from './utils.js';
+
+const API_URL = 'https://api.models.hathora.dev/inference/v1/stt';
+const AUTHORIZATION_HEADER = 'Authorization';
+
+/**
+ * @interface STTOptions - Options for configuring the Hathora STT service.
+ * @property model - Model to use; find available models [here](https://models.hathora.dev).
+ * @property [language] - Language code (if supported by model).
+ * @property [modelConfig] - Some models support additional config, refer to [docs](https://models.hathora.dev)
+ * for each model to see what is supported.
+ * @property [baseURL] - Base API URL for the Hathora STT service.
+ * @property [apiKey] - API key for authentication with the Hathora service;
+ * provision one [here](https://models.hathora.dev/tokens).
+ */
+export interface STTOptions {
+  /** Model to use; find available models [here](https://models.hathora.dev).*/
+  model: string;
+  /** Language code (if supported by model). */
+  language?: string;
+  /** Some models support additional config, refer to [docs](https://models.hathora.dev)
+   * for each model to see what is supported. */
+  modelConfig?: ConfigOption[];
+  /** Base API URL for the Hathora STT service. */
+  baseURL?: string;
+  /** API key for authentication with the Hathora service;
+   * provision one [here](https://models.hathora.dev/tokens). */
+  apiKey?: string;
+}
+
+const defaultSTTOptions: Partial<STTOptions> = {
+  baseURL: API_URL,
+  apiKey: process.env.HATHORA_API_KEY,
+};
+
+/**
+ * This service supports several different speech-to-text models hosted by Hathora.
+ *
+ * [Documentation](https://models.hathora.dev)
+ */
+export class STT extends stt.STT {
+  label = 'hathora.STT';
+  #opts: STTOptions;
+  #url: URL;
+
+  constructor(opts: STTOptions) {
+    super({ streaming: false, interimResults: false });
+
+    this.#opts = {
+      ...defaultSTTOptions,
+      ...opts,
+    };
+
+    if (opts.baseURL === undefined) {
+      this.#opts.baseURL = API_URL;
+    }
+
+    // remove trailing slash from baseURL
+    const baseURL = this.#opts.baseURL!.replace(/\/$/, '');
+
+    this.#url = new URL(baseURL);
+
+    if (this.#opts.apiKey === undefined) {
+      throw new Error('Hathora API key is required, whether as an argument or as $HATHORA_API_KEY');
+    }
+  }
+
+  #createWav(frame: AudioFrame): Buffer {
+    const bitsPerSample = 16;
+    const byteRate = (frame.sampleRate * frame.channels * bitsPerSample) / 8;
+    const blockAlign = (frame.channels * bitsPerSample) / 8;
+
+    const header = Buffer.alloc(44);
+    header.write('RIFF', 0);
+    header.writeUInt32LE(36 + frame.data.byteLength, 4);
+    header.write('WAVE', 8);
+    header.write('fmt ', 12);
+    header.writeUInt32LE(16, 16);
+    header.writeUInt16LE(1, 20);
+    header.writeUInt16LE(frame.channels, 22);
+    header.writeUInt32LE(frame.sampleRate, 24);
+    header.writeUInt32LE(byteRate, 28);
+    header.writeUInt16LE(blockAlign, 32);
+    header.writeUInt16LE(16, 34);
+    header.write('data', 36);
+    header.writeUInt32LE(frame.data.byteLength, 40);
+    return Buffer.concat([header, Buffer.from(frame.data.buffer)]);
+  }
+
+  async _recognize(buffer: AudioBuffer, abortSignal?: AbortSignal): Promise<stt.SpeechEvent> {
+    const headers: HeadersInit = {
+      [AUTHORIZATION_HEADER]: `Bearer ${this.#opts.apiKey!}`,
+      'Content-Type': 'application/json',
+    };
+
+    const body: any = {
+      model: this.#opts.model,
+    };
+
+    if (this.#opts.language) {
+      body.language = this.#opts.language;
+    }
+
+    if (this.#opts.modelConfig) {
+      body.model_config = this.#opts.modelConfig;
+    }
+
+    body.audio = this.#createWav(combineAudioFrames(buffer)).toString('base64');
+
+    const response = await fetch(this.#url, {
+      method: 'POST',
+      headers,
+      body: JSON.stringify(body),
+      signal: abortSignal,
+    });
+
+    if (!response.ok) {
+      throw new Error(`STT request failed: ${response.status} ${response.statusText}`);
+    }
+
+    const result = await response.json();
+
+    return {
+      type: stt.SpeechEventType.FINAL_TRANSCRIPT,
+      alternatives: [
+        {
+          text: result.text || '',
+          language: this.#opts.language || '',
+          startTime: 0,
+          endTime: 0,
+          confidence: 0,
+        },
+      ],
+    };
+  }
+
+  stream(): stt.SpeechStream {
+    throw new Error('Streaming is not supported on Hathora STT');
+  }
+}
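To make the request path above concrete, the sketch below feeds a silent frame through the recognizer. It assumes the public `recognize()` wrapper that agents-js STT implementations inherit from `stt.STT` (which dispatches to the `_recognize()` defined in this file); the model name is a placeholder.

```typescript
import { STT } from '@livekit/agents-plugin-hathora';
import { AudioFrame } from '@livekit/rtc-node';

// Placeholder model name; see https://models.hathora.dev for the real catalog.
// HATHORA_API_KEY supplies the key when apiKey is not passed explicitly.
const hathoraSTT = new STT({ model: 'example-stt-model', language: 'en' });

// One second of 16 kHz mono silence, just to exercise the request path.
const samples = new Int16Array(16000);
const frame = new AudioFrame(samples, 16000, 1, samples.length);

// recognize() (assumed inherited from stt.STT) wraps the frame into a WAV
// payload via #createWav and posts it to the Hathora STT endpoint.
const event = await hathoraSTT.recognize(frame);
console.log(event.alternatives?.[0]?.text);
```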
diff --git a/plugins/hathora/src/tts.ts b/plugins/hathora/src/tts.ts
new file mode 100644
index 00000000..9d68162e
--- /dev/null
+++ b/plugins/hathora/src/tts.ts
@@ -0,0 +1,190 @@
+// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import {
+  AudioByteStream,
+  shortuuid,
+  tts,
+} from '@livekit/agents';
+import { URL } from 'node:url';
+import type { ConfigOption } from './utils.js';
+
+const API_URL = 'https://api.models.hathora.dev/inference/v1/tts';
+const AUTHORIZATION_HEADER = 'Authorization';
+const SAMPLE_RATE = 24000;
+const NUM_CHANNELS = 1;
+
+/**
+ * @interface TTSOptions - Options for configuring the Hathora TTS service.
+ * @property model - Model to use; find available models [here](https://models.hathora.dev).
+ * @property [voice] - Voice to use for synthesis (if supported by model).
+ * @property [speed] - Speech speed multiplier (if supported by model).
+ * @property [modelConfig] - Some models support additional config, refer to [docs](https://models.hathora.dev)
+ * for each model to see what is supported.
+ * @property [baseURL] - Base API URL for the Hathora TTS service.
+ * @property [apiKey] - API key for authentication with the Hathora service;
+ * provision one [here](https://models.hathora.dev/tokens).
+ */
+export interface TTSOptions {
+  /** Model to use; find available models [here](https://models.hathora.dev).*/
+  model: string;
+  /** Voice to use for synthesis (if supported by model). */
+  voice?: string;
+  /** Speech speed multiplier (if supported by model). */
+  speed?: number;
+  /** Some models support additional config, refer to [docs](https://models.hathora.dev)
+   * for each model to see what is supported. */
+  modelConfig?: ConfigOption[];
+  /** Base API URL for the Hathora TTS service. */
+  baseURL?: string;
+  /** API key for authentication with the Hathora service;
+   * provision one [here](https://models.hathora.dev/tokens). */
+  apiKey?: string;
+}
+
+const defaultTTSOptions: Partial<TTSOptions> = {
+  baseURL: API_URL,
+  apiKey: process.env.HATHORA_API_KEY,
+};
+
+/**
+ * This service supports several different text-to-speech models hosted by Hathora.
+ *
+ * [Documentation](https://models.hathora.dev)
+ */
+export class TTS extends tts.TTS {
+  #opts: TTSOptions;
+  label = 'hathora.TTS';
+
+  constructor(opts: TTSOptions) {
+    super(SAMPLE_RATE, NUM_CHANNELS, {
+      streaming: false,
+    });
+
+    this.#opts = {
+      ...defaultTTSOptions,
+      ...opts,
+    };
+
+    if (this.#opts.apiKey === undefined) {
+      throw new Error(
+        'Hathora API key is required, whether as an argument or as $HATHORA_API_KEY',
+      );
+    }
+  }
+
+  synthesize(text: string): tts.ChunkedStream {
+    return new ChunkedStream(this, text, this.#opts);
+  }
+
+  stream(): tts.SynthesizeStream {
+    throw new Error('Streaming is not supported on Hathora TTS');
+  }
+}
+
+export class ChunkedStream extends tts.ChunkedStream {
+  label = 'hathora.ChunkedStream';
+  #opts: TTSOptions;
+  #text: string;
+  #url: URL;
+
+  constructor(tts: TTS, text: string, opts: TTSOptions) {
+    super(text, tts);
+    this.#text = text;
+
+    this.#opts = opts;
+
+    if (opts.baseURL === undefined) {
+      this.#opts.baseURL = API_URL;
+    }
+
+    // remove trailing slash from baseURL
+    const baseURL = this.#opts.baseURL!.replace(/\/$/, '');
+
+    this.#url = new URL(baseURL);
+  }
+
+  protected async run() {
+    const requestId = shortuuid();
+
+    const headers: HeadersInit = {
+      [AUTHORIZATION_HEADER]: `Bearer ${this.#opts.apiKey!}`,
+      'Content-Type': 'application/json',
+    };
+
+    const body: any = {
+      model: this.#opts.model,
+      text: this.#text,
+    };
+
+    if (this.#opts.voice) {
+      body.voice = this.#opts.voice;
+    }
+    if (this.#opts.speed) {
+      body.speed = this.#opts.speed;
+    }
+    if (this.#opts.modelConfig) {
+      body.model_config = this.#opts.modelConfig;
+    }
+
+    const response = await fetch(
+      this.#url,
+      {
+        method: 'POST',
+        headers,
+        body: JSON.stringify(body),
+      },
+    );
+
+    if (!response.ok) {
+      throw new Error(`TTS request failed: ${response.status} ${response.statusText}`);
+    }
+
+    const arrayBuffer = await response.arrayBuffer();
+
+    // Convert the WAV/PCM payload into raw PCM samples to prevent clicking sounds
+    const rawPCM = convertWavToRawPCM(arrayBuffer);
+
+    const bstream = new AudioByteStream(SAMPLE_RATE, NUM_CHANNELS);
+    for (const frame of bstream.write(rawPCM)) {
+      this.queue.put({
+        requestId,
+        frame,
+        final: false,
+        segmentId: requestId,
+      });
+    }
+
+    // Flush any samples still buffered in the byte stream so the tail of the
+    // audio is not dropped.
+    for (const frame of bstream.flush()) {
+      this.queue.put({
+        requestId,
+        frame,
+        final: false,
+        segmentId: requestId,
+      });
+    }
+  }
+}
+
+const convertWavToRawPCM = (wavBuffer: ArrayBuffer): ArrayBuffer => {
+  const dataView = new DataView(wavBuffer);
+
+  // Check the "RIFF" chunk descriptor
+  if (dataView.getUint32(0, false) !== 0x52494646) { // "RIFF"
+    throw new Error('Invalid WAV file: Missing "RIFF" descriptor');
+  }
+
+  // Check the "WAVE" format
+  if (dataView.getUint32(8, false) !== 0x57415645) { // "WAVE"
+    throw new Error('Invalid WAV file: Missing "WAVE" format');
+  }
+
+  // Find the "data" sub-chunk
+  let offset = 12;
+  while (offset < dataView.byteLength) {
+    const subChunkID = dataView.getUint32(offset, false);
+    const subChunkSize = dataView.getUint32(offset + 4, true);
+
+    if (subChunkID === 0x64617461) { // "data"
+      const dataStart = offset + 8;
+      const dataEnd = dataStart + subChunkSize;
+      return wavBuffer.slice(dataStart, dataEnd);
+    }
+
+    offset += 8 + subChunkSize;
+  }
+
+  throw new Error('Invalid WAV file: Missing "data" sub-chunk');
+};
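For completeness, here is a rough sketch of consuming the `ChunkedStream` returned by `synthesize()`. It assumes the stream is async-iterable and yields items whose `frame` carries 24 kHz mono PCM, as the `queue.put()` calls above suggest and as other agents-js TTS plugins behave; the model and voice names are placeholders.

```typescript
import { TTS } from '@livekit/agents-plugin-hathora';

// Placeholder model/voice names; see https://models.hathora.dev for the
// models actually offered. HATHORA_API_KEY is used when apiKey is omitted.
const hathoraTTS = new TTS({ model: 'example-tts-model', voice: 'example-voice' });

// Each yielded item wraps one audio frame decoded from the WAV response;
// the frames can be pushed to an AudioSource or written to disk.
for await (const audio of hathoraTTS.synthesize('Hello from Hathora!')) {
  console.log(`received ${audio.frame.samplesPerChannel} samples`);
}
```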
diff --git a/plugins/hathora/src/utils.ts b/plugins/hathora/src/utils.ts
new file mode 100644
index 00000000..bb289477
--- /dev/null
+++ b/plugins/hathora/src/utils.ts
@@ -0,0 +1,4 @@
+export interface ConfigOption {
+  name: string;
+  value: string;
+}
\ No newline at end of file
diff --git a/plugins/hathora/tsconfig.json b/plugins/hathora/tsconfig.json
new file mode 100644
index 00000000..d3126e4d
--- /dev/null
+++ b/plugins/hathora/tsconfig.json
@@ -0,0 +1,16 @@
+{
+  "extends": "../../tsconfig.json",
+  "include": ["./src"],
+  "compilerOptions": {
+    // match output dir to input dir. e.g. dist/index instead of dist/src/index
+    "rootDir": "./src",
+    "declarationDir": "./dist",
+    "outDir": "./dist"
+  },
+  "typedocOptions": {
+    "name": "plugins/agents-plugin-hathora",
+    "entryPointStrategy": "resolve",
+    "readme": "none",
+    "entryPoints": ["src/index.ts"]
+  }
+}
diff --git a/plugins/hathora/tsup.config.ts b/plugins/hathora/tsup.config.ts
new file mode 100644
index 00000000..8ca20961
--- /dev/null
+++ b/plugins/hathora/tsup.config.ts
@@ -0,0 +1,7 @@
+import { defineConfig } from 'tsup';
+
+import defaults from '../../tsup.config';
+
+export default defineConfig({
+  ...defaults,
+});
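One last usage note beyond the build plumbing: models that accept extra configuration take it through the `modelConfig` option, an array of the `ConfigOption` name/value pairs defined in `src/utils.ts`, which both `STT` and `TTS` forward verbatim as `model_config` in the request body. The option names below are hypothetical; each model documents its own at https://models.hathora.dev.

```typescript
import { STT, type ConfigOption } from '@livekit/agents-plugin-hathora';

// Hypothetical option names, shown only to illustrate the shape of the
// ConfigOption interface; consult the model's documentation for real ones.
const modelConfig: ConfigOption[] = [{ name: 'example_option', value: 'example_value' }];

const hathoraSTT = new STT({ model: 'example-stt-model', modelConfig });
```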