framersai
diff --git a/‎registry/curated/voice/diarization/SKILL.md‎
Lines changed: 71 additions & 0 deletions b/‎registry/curated/voice/diarization/SKILL.md‎
Lines changed: 71 additions & 0 deletions
diff --git a/‎registry/curated/voice/diarization/manifest.json‎
Lines changed: 8 additions & 0 deletions b/‎registry/curated/voice/diarization/manifest.json‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎registry/curated/voice/diarization/package.json‎
Lines changed: 31 additions & 0 deletions b/‎registry/curated/voice/diarization/package.json‎
Lines changed: 31 additions & 0 deletions
diff --git a/‎registry/curated/voice/diarization/src/ClusteringStrategy.ts‎
Lines changed: 139 additions & 0 deletions b/‎registry/curated/voice/diarization/src/ClusteringStrategy.ts‎
Lines changed: 139 additions & 0 deletions
diff --git a/‎registry/curated/voice/diarization/src/DiarizationEngine.ts‎
Lines changed: 94 additions & 0 deletions b/‎registry/curated/voice/diarization/src/DiarizationEngine.ts‎
Lines changed: 94 additions & 0 deletions
@@ -0,0 +1,71 @@
+---
+name: diarization
+description: Speaker diarization — identifies and tracks who is speaking at each moment in an audio stream
+category: voice
+---
+
+# Diarization Extension Pack
+
+Speaker diarization for the AgentOS voice pipeline.  Supports two modes:
+
+1. **Provider-delegated** — extracts speaker labels from STT provider word-level results (e.g. Deepgram with `diarize: true`).  Zero additional compute, no voiceprint model needed.
+2. **Local clustering** — uses a sliding-window spectral-centroid voiceprint with agglomerative clustering, fully offline.  An ONNX x-vector model can be plugged in later without API changes.
+
+## Setup
+
+No API key required for local mode.  For provider mode, enable diarization on your STT provider (e.g. `providerOptions.diarize: true` on the Deepgram STT pack).
+
+## Configuration
+
+```json
+{
+  "voice": {
+    "diarization": "local"
+  }
+}
+```
+
+To use provider-delegated diarization:
+
+```json
+{
+  "voice": {
+    "diarization": "provider",
+    "stt": "deepgram",
+    "providerOptions": { "diarize": true }
+  }
+}
+```
+
+### Speaker enrollment (optional)
+
+Pre-register known speakers so the engine can label them by name instead of `Speaker_N`:
+
+```ts
+await session.enrollSpeaker('Alice', aliceVoiceprintFloat32Array);
+await session.enrollSpeaker('Bob', bobVoiceprintFloat32Array);
+```
+
+## Events
+
+| Event                 | Payload              | Description                                              |
+|-----------------------|----------------------|----------------------------------------------------------|
+| `speaker_identified`  | `SpeakerIdentified`  | Emitted whenever the active speaker label changes        |
+| `segment_ready`       | `DiarizedSegment`    | A labelled audio or transcript segment is ready          |
+| `error`               | `Error`              | Unrecoverable diarization error                          |
+| `close`               | —                    | Session fully terminated                                 |
+
+## Local backend feature extraction
+
+The built-in feature extractor computes a 16-dimensional vector per 1.5 s window (0.5 s overlap):
+
+- Dimensions 0–3: octave-band RMS energy (sub-bass, bass, mid, high)
+- Dimensions 4–7: spectral centroid per octave band
+- Dimensions 8–11: zero-crossing rate per octave band
+- Dimensions 12–15: delta energy (frame-to-frame change) per octave band
+
+This is intentionally lightweight.  Replace `LocalDiarizationBackend.extractSimpleEmbedding()` with an ONNX x-vector model for production-quality voiceprints.
+
+## Clustering
+
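+`ClusteringStrategy` runs agglomerative merging whenever the centroid count exceeds `expectedSpeakers`.  While the count is above `expectedSpeakers`, the most similar pair of centroids is merged regardless of how similar they are; beyond that, centroids with cosine similarity of at least `mergeThreshold` (default 0.85) are collapsed into a single speaker identity.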
@@ -0,0 +1,8 @@
+{
+  "name": "@framers/agentos-ext-diarization",
+  "version": "0.1.0",
+  "description": "Speaker diarization with provider delegation and local spectral-centroid clustering",
+  "kind": "diarization-provider",
+  "extensionId": "diarization",
+  "entryPoint": "./dist/index.js"
+}
@@ -0,0 +1,31 @@
+{
+  "name": "@framers/agentos-ext-diarization",
+  "version": "0.1.0",
+  "description": "Speaker diarization extension pack for AgentOS voice pipeline — provider delegation and local clustering",
+  "type": "module",
+  "main": "./dist/index.js",
+  "types": "./dist/index.d.ts",
+  "exports": { ".": { "import": "./dist/index.js", "types": "./dist/index.d.ts" } },
+  "files": ["dist", "src", "SKILL.md", "manifest.json"],
+  "scripts": { "build": "tsc -p tsconfig.json", "test": "vitest run" },
+  "peerDependencies": {
+    "@framers/agentos": "^0.1.0",
+    "onnxruntime-node": "^1.18.0"
+  },
+  "peerDependenciesMeta": {
+    "onnxruntime-node": { "optional": true }
+  },
+  "devDependencies": {
+    "@framers/agentos": "workspace:*",
+    "typescript": "^5.5.0",
+    "vitest": "^1.6.0"
+  },
+  "license": "MIT",
+  "author": "Frame.dev",
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/framersai/agentos-extensions.git",
+    "directory": "registry/curated/voice/diarization"
+  },
+  "publishConfig": { "access": "public" }
+}
@@ -0,0 +1,139 @@
+/**
+ * @file ClusteringStrategy.ts
+ * @description Agglomerative clustering to merge drifted speaker centroids.
+ *
+ * Over the course of a long session, a single speaker's vocal characteristics
+ * may drift enough that two separate centroids are created for them.
+ * {@link ClusteringStrategy.mergeClusters} detects this by computing pairwise
+ * cosine similarity between all centroids and iteratively merging the closest
+ * pair.  Merging stops once the closest pair falls below the merge threshold
+ * and the centroid count no longer exceeds `expectedSpeakers` (when given).
+ *
+ * @module diarization/ClusteringStrategy
+ */
+
+import { cosineSimilarity } from './SpeakerEmbeddingCache.js';
+
+// ---------------------------------------------------------------------------
+// Main class
+// ---------------------------------------------------------------------------
+
+/**
+ * Agglomerative speaker-centroid merging strategy.
+ *
+ * This is an optional post-processing step applied by {@link DiarizationSession}
+ * when the number of tracked centroids exceeds the expected speaker count.
+ *
+ * The strategy works on a private copy of the centroid map and allocates a
+ * new array for every merged centroid, so the map and embeddings passed in
+ * are never modified; the only output is the rename map.
+ *
+ * @example
+ * ```ts
+ * const strategy = new ClusteringStrategy(0.85);
+ * const mapping = strategy.mergeClusters(cache.centroids, 2);
+ * // mapping: Map<oldId, canonicalId>
+ * ```
+ */
+export class ClusteringStrategy {
+  // -------------------------------------------------------------------------
+  // Constructor
+  // -------------------------------------------------------------------------
+
+  /**
+   * @param mergeThreshold - Minimum cosine similarity between two centroids
+   *   for them to be considered the same speaker and merged.  The comparison
+   *   is inclusive: a pair whose similarity equals the threshold is merged.
+   *   @defaultValue 0.85
+   */
+  constructor(private readonly mergeThreshold: number = 0.85) {}
+
+  // -------------------------------------------------------------------------
+  // Public API
+  // -------------------------------------------------------------------------
+
+  /**
+   * Identify centroid pairs that should be merged and return a renaming map.
+   *
+   * The algorithm is greedy and performs one merge per pass:
+   * 1. Compute all pairwise cosine similarities and pick the most similar
+   *    pair (on ties, the first pair encountered in iteration order wins).
+   * 2. Merge that pair if its similarity is at least `mergeThreshold`, or if
+   *    `expectedSpeakers` is set and the current count exceeds it (in which
+   *    case the threshold is ignored).
+   * 3. The merged centroid is the unweighted element-wise mean of the two
+   *    embeddings and is stored under the ID that comes first in the map's
+   *    iteration order.
+   * 4. Repeat until fewer than two centroids remain or neither condition in
+   *    step 2 holds.
+   *
+   * Note that `expectedSpeakers` only forces merges while the count is above
+   * it; it does not stop threshold-based merges, so the final count can end
+   * up below `expectedSpeakers`.
+   *
+   * The returned `Map<string, string>` maps every old centroid ID that was
+   * subsumed into a canonical ID.  IDs that were not merged are not present in
+   * the map.  Values are always IDs that survived, so callers can rename all
+   * occurrences of a key to its value in a single lookup.
+   *
+   * @param centroids - Current centroid snapshot (id → embedding).  Not
+   *   modified by this method.
+   * @param expectedSpeakers - Optional upper bound on speaker count.
+   * @returns Rename map: `oldId → canonicalId`.
+   */
+  mergeClusters(
+    centroids: Map<string, Float32Array>,
+    expectedSpeakers?: number,
+  ): Map<string, string> {
+    // Build a mutable working copy so we can iteratively merge.
+    const working = new Map<string, Float32Array>(centroids);
+    // Accumulated rename mapping.
+    const renameMap = new Map<string, string>();
+
+    while (true) {
+      const ids = Array.from(working.keys());
+      const count = ids.length;
+
+      // Nothing to merge.
+      if (count < 2) break;
+
+      // Find the closest pair.  The strict `>` below means the first pair
+      // reaching the maximum similarity is the one selected.
+      let bestSim = -Infinity;
+      let bestI = 0;
+      let bestJ = 1;
+
+      for (let i = 0; i < count; i++) {
+        for (let j = i + 1; j < count; j++) {
+          const sim = cosineSimilarity(working.get(ids[i]!)!, working.get(ids[j]!)!);
+          if (sim > bestSim) {
+            bestSim = sim;
+            bestI = i;
+            bestJ = j;
+          }
+        }
+      }
+
+      // Decide whether to merge: either the pair is similar enough, or we are
+      // still above the expected speaker count and must merge regardless.
+      const shouldMergeDueToThreshold = bestSim >= this.mergeThreshold;
+      const shouldMergeDueToCount =
+        expectedSpeakers !== undefined && count > expectedSpeakers;
+
+      if (!shouldMergeDueToThreshold && !shouldMergeDueToCount) break;
+
+      // Merge ids[bestJ] into ids[bestI].  Because bestI < bestJ, the ID that
+      // comes first in the working map's iteration (insertion) order is kept
+      // as the canonical one, which makes the result deterministic for a
+      // given input order.  This is NOT a lexicographic comparison of IDs.
+      const keepId = ids[bestI]!;
+      const dropId = ids[bestJ]!;
+
+      // Average the two centroids (equal weight — simple heuristic).
+      // NOTE(review): the loop is bounded by keepEmb.length and assumes both
+      // embeddings have the same dimensionality — confirm upstream; a shorter
+      // dropEmb would yield NaN components.
+      const keepEmb = working.get(keepId)!;
+      const dropEmb = working.get(dropId)!;
+      const merged = new Float32Array(keepEmb.length);
+      for (let k = 0; k < keepEmb.length; k++) {
+        merged[k] = (keepEmb[k]! + dropEmb[k]!) / 2;
+      }
+
+      // Overwriting an existing key keeps keepId at its original position in
+      // the map's iteration order.
+      working.set(keepId, merged);
+      working.delete(dropId);
+
+      // Record the rename for the centroid that was just dropped.
+      renameMap.set(dropId, keepId);
+
+      // Resolve transitive renames: if dropId was itself a canonical target of
+      // an earlier merge, update those entries to point to keepId.  Only
+      // existing keys are overwritten, so iterating while setting is safe.
+      for (const [old, target] of renameMap) {
+        if (target === dropId) {
+          renameMap.set(old, keepId);
+        }
+      }
+    }
+
+    return renameMap;
+  }
+}
@@ -0,0 +1,94 @@
+/**
+ * @file DiarizationEngine.ts
+ * @description Factory that creates {@link DiarizationSession} instances,
+ * selecting the appropriate backend based on the supplied configuration.
+ *
+ * When a provider backend is requested (`config.backend === 'provider'`), the
+ * engine creates a {@link ProviderDiarizationBackend} and wraps it in a
+ * session.  Otherwise it constructs the full local pipeline:
+ * {@link SlidingWindowExtractor} → {@link LocalDiarizationBackend} →
+ * {@link SpeakerEmbeddingCache}.
+ *
+ * @module diarization/DiarizationEngine
+ */
+
+import type { IDiarizationEngine, IDiarizationSession, DiarizationConfig } from './types.js';
+import { DiarizationSession } from './DiarizationSession.js';
+import { SpeakerEmbeddingCache } from './SpeakerEmbeddingCache.js';
+import { SlidingWindowExtractor } from './SlidingWindowExtractor.js';
+import { LocalDiarizationBackend } from './LocalDiarizationBackend.js';
+import { ProviderDiarizationBackend } from './ProviderDiarizationBackend.js';
+
+// ---------------------------------------------------------------------------
+// Default values
+// ---------------------------------------------------------------------------
+
+// Fallbacks used by DiarizationEngine.startSession() in local mode whenever
+// the matching DiarizationConfig field is null or undefined.
+
+// Passed to the SpeakerEmbeddingCache constructor.
+const DEFAULT_SIMILARITY_THRESHOLD = 0.7;
+// Analysis window length in milliseconds (first SlidingWindowExtractor argument).
+const DEFAULT_CHUNK_SIZE_MS = 1500;
+// Overlap between windows in milliseconds (second SlidingWindowExtractor argument).
+const DEFAULT_OVERLAP_MS = 500;
+// Audio sample rate (third SlidingWindowExtractor argument) — presumably Hz; TODO confirm.
+const DEFAULT_SAMPLE_RATE = 16_000;
+
+// ---------------------------------------------------------------------------
+// Main class
+// ---------------------------------------------------------------------------
+
+/**
+ * Stateless factory that builds diarization sessions.
+ *
+ * Every call to {@link DiarizationEngine.startSession} constructs a new
+ * session together with new backend objects; nothing is cached on the engine.
+ *
+ * @example
+ * ```ts
+ * const engine = new DiarizationEngine();
+ *
+ * // Local mode (default)
+ * const session = engine.startSession();
+ * session.on('speaker_identified', ({ speakerId }) => console.log(speakerId));
+ *
+ * // Provider mode
+ * const providerSession = engine.startSession({ backend: 'provider' });
+ * sttSession.on('transcript', (e) => providerSession.labelTranscript(e));
+ * ```
+ */
+export class DiarizationEngine implements IDiarizationEngine {
+  // -------------------------------------------------------------------------
+  // IDiarizationEngine implementation
+  // -------------------------------------------------------------------------
+
+  /**
+   * Build a new {@link IDiarizationSession} for the requested backend.
+   *
+   * - `config.backend === 'provider'`: the session wraps a new
+   *   {@link ProviderDiarizationBackend}; the remaining config fields are not
+   *   read.
+   * - any other value, or no value at all (treated as `'local'`): the session
+   *   wraps a {@link LocalDiarizationBackend} wired to a new
+   *   {@link SpeakerEmbeddingCache} and {@link SlidingWindowExtractor}.
+   *   Unset tuning fields fall back to the module-level defaults.
+   *
+   * @param config - Optional session configuration; defaults to `{}`.
+   * @returns The newly constructed {@link DiarizationSession}.
+   */
+  startSession(config: DiarizationConfig = {}): IDiarizationSession {
+    const mode = config.backend ?? 'local';
+
+    if (mode !== 'provider') {
+      // Local mode — resolve every tunable first, then assemble the pipeline.
+      const similarity = config.similarityThreshold ?? DEFAULT_SIMILARITY_THRESHOLD;
+      const windowMs = config.chunkSizeMs ?? DEFAULT_CHUNK_SIZE_MS;
+      const windowOverlapMs = config.overlapMs ?? DEFAULT_OVERLAP_MS;
+      const rate = config.sampleRate ?? DEFAULT_SAMPLE_RATE;
+
+      const embeddingCache = new SpeakerEmbeddingCache(similarity);
+      const windowExtractor = new SlidingWindowExtractor(windowMs, windowOverlapMs, rate);
+
+      // The session receives the cache and extractor alongside the backend
+      // that was built from them.
+      return new DiarizationSession({
+        kind: 'local',
+        backend: new LocalDiarizationBackend(embeddingCache, windowExtractor),
+        cache: embeddingCache,
+        extractor: windowExtractor,
+      });
+    }
+
+    // Provider mode — the backend takes no configuration.
+    return new DiarizationSession({
+      kind: 'provider',
+      backend: new ProviderDiarizationBackend(),
+    });
+  }
+}