Skip to content

Commit 02ffa32

Browse files
committed
feat(ext): add diarization extension pack with provider delegation and local clustering
1 parent 22216fc commit 02ffa32

17 files changed

+2029
-0
lines changed
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
---
2+
name: diarization
3+
description: Speaker diarization — identifies and tracks who is speaking at each moment in an audio stream
4+
category: voice
5+
---
6+
7+
# Diarization Extension Pack
8+
9+
Speaker diarization for the AgentOS voice pipeline. Supports two modes:
10+
11+
1. **Provider-delegated** — extracts speaker labels from STT provider word-level results (e.g. Deepgram with `diarize: true`). Zero additional compute, no voiceprint model needed.
12+
2. **Local clustering** — uses a sliding-window spectral-centroid voiceprint with agglomerative clustering, fully offline. An ONNX x-vector model can be plugged in later without API changes.
13+
14+
## Setup
15+
16+
No API key required for local mode. For provider mode, enable diarization on your STT provider (e.g. `providerOptions.diarize: true` on the Deepgram STT pack).
17+
18+
## Configuration
19+
20+
```json
21+
{
22+
"voice": {
23+
"diarization": "local"
24+
}
25+
}
26+
```
27+
28+
To use provider-delegated diarization:
29+
30+
```json
31+
{
32+
"voice": {
33+
"diarization": "provider",
34+
"stt": "deepgram",
35+
"providerOptions": { "diarize": true }
36+
}
37+
}
38+
```
39+
40+
### Speaker enrollment (optional)
41+
42+
Pre-register known speakers so the engine can label them by name instead of `Speaker_N`:
43+
44+
```ts
45+
await session.enrollSpeaker('Alice', aliceVoiceprintFloat32Array);
46+
await session.enrollSpeaker('Bob', bobVoiceprintFloat32Array);
47+
```
48+
49+
## Events
50+
51+
| Event | Payload | Description |
52+
|-----------------------|----------------------|----------------------------------------------------------|
53+
| `speaker_identified` | `SpeakerIdentified` | Emitted whenever the active speaker label changes |
54+
| `segment_ready` | `DiarizedSegment` | A labelled audio or transcript segment is ready |
55+
| `error` | `Error` | Unrecoverable diarization error |
56+
| `close` || Session fully terminated |
57+
58+
## Local backend feature extraction
59+
60+
The built-in feature extractor computes a 16-dimensional vector per 1.5 s window (0.5 s overlap):
61+
62+
- Dimensions 0–3: octave-band RMS energy (sub-bass, bass, mid, high)
63+
- Dimensions 4–7: spectral centroid per octave band
64+
- Dimensions 8–11: zero-crossing rate per octave band
65+
- Dimensions 12–15: delta energy (frame-to-frame change) per octave band
66+
67+
This is intentionally lightweight. Replace `LocalDiarizationBackend.extractSimpleEmbedding()` with an ONNX x-vector model for production-quality voiceprints.
68+
69+
## Clustering
70+
71+
`ClusteringStrategy` runs agglomerative merging whenever the centroid count exceeds `expectedSpeakers`. Centroids with cosine similarity above `mergeThreshold` (default 0.85) are collapsed into a single speaker identity.
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{
2+
"name": "@framers/agentos-ext-diarization",
3+
"version": "0.1.0",
4+
"description": "Speaker diarization with provider delegation and local spectral-centroid clustering",
5+
"kind": "diarization-provider",
6+
"extensionId": "diarization",
7+
"entryPoint": "./dist/index.js"
8+
}
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
{
2+
"name": "@framers/agentos-ext-diarization",
3+
"version": "0.1.0",
4+
"description": "Speaker diarization extension pack for AgentOS voice pipeline — provider delegation and local clustering",
5+
"type": "module",
6+
"main": "./dist/index.js",
7+
"types": "./dist/index.d.ts",
8+
"exports": { ".": { "import": "./dist/index.js", "types": "./dist/index.d.ts" } },
9+
"files": ["dist", "src", "SKILL.md", "manifest.json"],
10+
"scripts": { "build": "tsc -p tsconfig.json", "test": "vitest run" },
11+
"peerDependencies": {
12+
"@framers/agentos": "^0.1.0",
13+
"onnxruntime-node": "^1.18.0"
14+
},
15+
"peerDependenciesMeta": {
16+
"onnxruntime-node": { "optional": true }
17+
},
18+
"devDependencies": {
19+
"@framers/agentos": "workspace:*",
20+
"typescript": "^5.5.0",
21+
"vitest": "^1.6.0"
22+
},
23+
"license": "MIT",
24+
"author": "Frame.dev",
25+
"repository": {
26+
"type": "git",
27+
"url": "https://github.com/framersai/agentos-extensions.git",
28+
"directory": "registry/curated/voice/diarization"
29+
},
30+
"publishConfig": { "access": "public" }
31+
}
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
/**
2+
* @file ClusteringStrategy.ts
3+
* @description Agglomerative clustering to merge drifted speaker centroids.
4+
*
5+
* Over the course of a long session, a single speaker's vocal characteristics
6+
* may drift enough that two separate centroids are created for them.
7+
* {@link ClusteringStrategy.mergeClusters} detects this by computing pairwise
8+
* cosine similarity between all centroids and iteratively merging the closest
9+
* pair until either no pair exceeds the merge threshold or the centroid count
10+
* equals `expectedSpeakers`.
11+
*
12+
* @module diarization/ClusteringStrategy
13+
*/
14+
15+
import { cosineSimilarity } from './SpeakerEmbeddingCache.js';
16+
17+
// ---------------------------------------------------------------------------
18+
// Main class
19+
// ---------------------------------------------------------------------------
20+
21+
/**
22+
* Agglomerative speaker-centroid merging strategy.
23+
*
24+
* This is an optional post-processing step applied by {@link DiarizationSession}
25+
* when the number of tracked centroids exceeds the expected speaker count.
26+
*
27+
* @example
28+
* ```ts
29+
* const strategy = new ClusteringStrategy(0.85);
30+
* const mapping = strategy.mergeClusters(cache.centroids, 2);
31+
* // mapping: Map<oldId, canonicalId>
32+
* ```
33+
*/
34+
export class ClusteringStrategy {
35+
// -------------------------------------------------------------------------
36+
// Constructor
37+
// -------------------------------------------------------------------------
38+
39+
/**
40+
* @param mergeThreshold - Minimum cosine similarity between two centroids
41+
* for them to be considered the same speaker and merged.
42+
* @defaultValue 0.85
43+
*/
44+
constructor(private readonly mergeThreshold: number = 0.85) {}
45+
46+
// -------------------------------------------------------------------------
47+
// Public API
48+
// -------------------------------------------------------------------------
49+
50+
/**
51+
* Identify centroid pairs that should be merged and return a renaming map.
52+
*
53+
* The algorithm:
54+
* 1. Compute all pairwise cosine similarities.
55+
* 2. If `expectedSpeakers` is set and the current count exceeds it, merge
56+
* the closest pair regardless of the threshold.
57+
* 3. Otherwise merge pairs that exceed `mergeThreshold`.
58+
* 4. Repeat until no further merges are possible or the count matches
59+
* `expectedSpeakers`.
60+
*
61+
* The returned `Map<string, string>` maps every old centroid ID that was
62+
* subsumed into a canonical ID. IDs that were not merged are not present in
63+
* the map. Callers should rename all occurrences of a key to its value.
64+
*
65+
* @param centroids - Current centroid snapshot (id → embedding).
66+
* @param expectedSpeakers - Optional upper bound on speaker count.
67+
* @returns Rename map: `oldId → canonicalId`.
68+
*/
69+
mergeClusters(
70+
centroids: Map<string, Float32Array>,
71+
expectedSpeakers?: number,
72+
): Map<string, string> {
73+
// Build a mutable working copy so we can iteratively merge.
74+
const working = new Map<string, Float32Array>(centroids);
75+
// Accumulated rename mapping.
76+
const renameMap = new Map<string, string>();
77+
78+
while (true) {
79+
const ids = Array.from(working.keys());
80+
const count = ids.length;
81+
82+
// Nothing to merge.
83+
if (count < 2) break;
84+
85+
// Find the closest pair.
86+
let bestSim = -Infinity;
87+
let bestI = 0;
88+
let bestJ = 1;
89+
90+
for (let i = 0; i < count; i++) {
91+
for (let j = i + 1; j < count; j++) {
92+
const sim = cosineSimilarity(working.get(ids[i]!)!, working.get(ids[j]!)!);
93+
if (sim > bestSim) {
94+
bestSim = sim;
95+
bestI = i;
96+
bestJ = j;
97+
}
98+
}
99+
}
100+
101+
// Decide whether to merge.
102+
const shouldMergeDueToThreshold = bestSim >= this.mergeThreshold;
103+
const shouldMergeDueToCount =
104+
expectedSpeakers !== undefined && count > expectedSpeakers;
105+
106+
if (!shouldMergeDueToThreshold && !shouldMergeDueToCount) break;
107+
108+
// Merge ids[bestJ] into ids[bestI] (keep the lexicographically earlier
109+
// ID as the canonical one for determinism).
110+
const keepId = ids[bestI]!;
111+
const dropId = ids[bestJ]!;
112+
113+
// Average the two centroids (equal weight — simple heuristic).
114+
const keepEmb = working.get(keepId)!;
115+
const dropEmb = working.get(dropId)!;
116+
const merged = new Float32Array(keepEmb.length);
117+
for (let k = 0; k < keepEmb.length; k++) {
118+
merged[k] = (keepEmb[k]! + dropEmb[k]!) / 2;
119+
}
120+
121+
working.set(keepId, merged);
122+
working.delete(dropId);
123+
124+
// Record the rename. Chase any existing mappings so the final map is
125+
// transitively resolved.
126+
renameMap.set(dropId, keepId);
127+
128+
// Resolve transitive renames: if dropId was itself a canonical target of
129+
// an earlier merge, update those entries to point to keepId.
130+
for (const [old, target] of renameMap) {
131+
if (target === dropId) {
132+
renameMap.set(old, keepId);
133+
}
134+
}
135+
}
136+
137+
return renameMap;
138+
}
139+
}
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
/**
2+
* @file DiarizationEngine.ts
3+
* @description Factory that creates {@link DiarizationSession} instances,
4+
* selecting the appropriate backend based on the supplied configuration.
5+
*
6+
* When a provider backend is requested (`config.backend === 'provider'`), the
7+
* engine creates a {@link ProviderDiarizationBackend} and wraps it in a
8+
* session. Otherwise it constructs the full local pipeline:
9+
* {@link SlidingWindowExtractor} → {@link LocalDiarizationBackend} →
10+
* {@link SpeakerEmbeddingCache}.
11+
*
12+
* @module diarization/DiarizationEngine
13+
*/
14+
15+
import type { IDiarizationEngine, IDiarizationSession, DiarizationConfig } from './types.js';
16+
import { DiarizationSession } from './DiarizationSession.js';
17+
import { SpeakerEmbeddingCache } from './SpeakerEmbeddingCache.js';
18+
import { SlidingWindowExtractor } from './SlidingWindowExtractor.js';
19+
import { LocalDiarizationBackend } from './LocalDiarizationBackend.js';
20+
import { ProviderDiarizationBackend } from './ProviderDiarizationBackend.js';
21+
22+
// ---------------------------------------------------------------------------
23+
// Default values
24+
// ---------------------------------------------------------------------------
25+
26+
const DEFAULT_SIMILARITY_THRESHOLD = 0.7;
27+
const DEFAULT_CHUNK_SIZE_MS = 1500;
28+
const DEFAULT_OVERLAP_MS = 500;
29+
const DEFAULT_SAMPLE_RATE = 16_000;
30+
31+
// ---------------------------------------------------------------------------
32+
// Main class
33+
// ---------------------------------------------------------------------------
34+
35+
/**
36+
* Factory for diarization sessions.
37+
*
38+
* @example
39+
* ```ts
40+
* const engine = new DiarizationEngine();
41+
*
42+
* // Local mode (default)
43+
* const session = engine.startSession();
44+
* session.on('speaker_identified', ({ speakerId }) => console.log(speakerId));
45+
*
46+
* // Provider mode
47+
* const providerSession = engine.startSession({ backend: 'provider' });
48+
* sttSession.on('transcript', (e) => providerSession.labelTranscript(e));
49+
* ```
50+
*/
51+
export class DiarizationEngine implements IDiarizationEngine {
52+
// -------------------------------------------------------------------------
53+
// IDiarizationEngine implementation
54+
// -------------------------------------------------------------------------
55+
56+
/**
57+
* Create and return a new {@link IDiarizationSession}.
58+
*
59+
* The backend is selected based on `config.backend`:
60+
* - `'provider'` — use {@link ProviderDiarizationBackend}
61+
* - `'local'` (default) — use {@link LocalDiarizationBackend} with a fresh
62+
* {@link SpeakerEmbeddingCache} and {@link SlidingWindowExtractor}
63+
*
64+
* @param config - Optional session configuration.
65+
* @returns A ready-to-use {@link DiarizationSession}.
66+
*/
67+
startSession(config: DiarizationConfig = {}): IDiarizationSession {
68+
const backend = config.backend ?? 'local';
69+
70+
if (backend === 'provider') {
71+
return new DiarizationSession({
72+
kind: 'provider',
73+
backend: new ProviderDiarizationBackend(),
74+
});
75+
}
76+
77+
// Local mode — wire up the full pipeline.
78+
const threshold = config.similarityThreshold ?? DEFAULT_SIMILARITY_THRESHOLD;
79+
const chunkSizeMs = config.chunkSizeMs ?? DEFAULT_CHUNK_SIZE_MS;
80+
const overlapMs = config.overlapMs ?? DEFAULT_OVERLAP_MS;
81+
const sampleRate = config.sampleRate ?? DEFAULT_SAMPLE_RATE;
82+
83+
const cache = new SpeakerEmbeddingCache(threshold);
84+
const extractor = new SlidingWindowExtractor(chunkSizeMs, overlapMs, sampleRate);
85+
const localBackend = new LocalDiarizationBackend(cache, extractor);
86+
87+
return new DiarizationSession({
88+
kind: 'local',
89+
backend: localBackend,
90+
cache,
91+
extractor,
92+
});
93+
}
94+
}

0 commit comments

Comments
 (0)