From d5e4d573860962813267fe6e4bd3a4713a093ecc Mon Sep 17 00:00:00 2001 From: Javier Toledo Date: Mon, 30 Mar 2026 20:29:05 +0100 Subject: [PATCH 1/7] Add scribe stream command for live microphone transcription MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New command: scribe stream — captures microphone audio and transcribes in real-time using FluidAudio's SlidingWindowAsrManager (Parakeet). Features: - Live transcription from microphone with timestamps - Text and JSONL output formats - Save to file with --output - Ctrl+C to stop cleanly - Uses streaming ASR config (11s chunks, 1s hypothesis updates) Usage: scribe stream # listen and transcribe scribe stream --format jsonl # JSONL output scribe stream --output meeting.txt # save to file System audio capture (--source) will be added in a follow-up. Co-Authored-By: Claude Opus 4.6 (1M context) --- Sources/scribe/Commands/Stream.swift | 149 +++++++++++++++++++++++++++ Sources/scribe/ScribeCLI.swift | 2 +- 2 files changed, 150 insertions(+), 1 deletion(-) create mode 100644 Sources/scribe/Commands/Stream.swift diff --git a/Sources/scribe/Commands/Stream.swift b/Sources/scribe/Commands/Stream.swift new file mode 100644 index 0000000..172d5c6 --- /dev/null +++ b/Sources/scribe/Commands/Stream.swift @@ -0,0 +1,149 @@ +import ArgumentParser +import AVFoundation +import FluidAudio +import Foundation + +struct Stream: AsyncParsableCommand { + static let configuration = CommandConfiguration( + abstract: "Stream live audio transcription from microphone or system audio." + ) + + @Flag(name: .long, help: "Capture microphone audio (default if no --source specified).") + var mic: Bool = false + + @Option(name: .long, help: "Output format: text or jsonl.") + var format: StreamOutputFormat = .text + + @Option(name: .shortAndLong, help: "Also save output to file.") + var output: String? + + @Flag(name: .long, help: "Show status information.") + var verbose: Bool = false + + func run() async throws { + // For now, mic is the default and only source + if verbose { log("Initializing streaming ASR (Parakeet)...") } + + let streamManager = SlidingWindowAsrManager(config: .streaming) + try await streamManager.start(source: .microphone) + + // Set up microphone capture + let audioEngine = AVAudioEngine() + let inputNode = audioEngine.inputNode + let inputFormat = inputNode.outputFormat(forBus: 0) + + if verbose { + log("Microphone: \(inputFormat.sampleRate)Hz, \(inputFormat.channelCount) ch") + log("Listening... (press Ctrl+C to stop)") + } + + // Track state for output + let startTime = Date() + var outputFile: FileHandle? = nil + var utteranceCount = 0 + + if let outputPath = output { + FileManager.default.createFile(atPath: outputPath, contents: nil) + outputFile = FileHandle(forWritingAtPath: outputPath) + } + + // Set up signal handler for clean Ctrl+C + signal(SIGINT) { _ in + Darwin.exit(0) + } + + // Install tap on microphone — feed buffers to ASR + inputNode.installTap(onBus: 0, bufferSize: 4096, format: inputFormat) { buffer, _ in + // nonisolated(unsafe) to cross sendability boundary — buffer is consumed immediately + nonisolated(unsafe) let buf = buffer + streamManager.streamAudio(buf) + } + + try audioEngine.start() + + if !verbose { + // Print a minimal status line to stderr + log("Listening... (press Ctrl+C to stop)") + } + + // Read transcription updates + let updates = await streamManager.transcriptionUpdates + for await update in updates { + // Stream continues until Ctrl+C or stream ends + + let text = update.text.trimmingCharacters(in: .whitespaces) + guard !text.isEmpty else { continue } + + utteranceCount += 1 + let elapsed = Date().timeIntervalSince(startTime) + let speaker = mic ? "You" : "Others" + + let line: String + switch format { + case .text: + let ts = formatTimestamp(elapsed) + line = "[\(ts)] \(speaker): \(text)" + case .jsonl: + let jsonObj: [String: Any] = [ + "time": round(elapsed * 10) / 10, + "speaker": speaker, + "text": text, + "confirmed": update.isConfirmed, + ] + if let data = try? JSONSerialization.data(withJSONObject: jsonObj, options: [.sortedKeys]), + let jsonStr = String(data: data, encoding: .utf8) { + line = jsonStr + } else { + continue + } + } + + print(line) + fflush(stdout) + + if let file = outputFile { + file.write(Data((line + "\n").utf8)) + } + } + + // Clean shutdown + audioEngine.stop() + inputNode.removeTap(onBus: 0) + let finalText = try await streamManager.finish() + + let totalElapsed = Date().timeIntervalSince(startTime) + + // Print summary to stderr + FileHandle.standardError.write(Data("\n--- Stream ended ---\n".utf8)) + FileHandle.standardError.write(Data(String(format: "Duration: %dm %ds\n", + Int(totalElapsed) / 60, Int(totalElapsed) % 60).utf8)) + FileHandle.standardError.write(Data("Utterances: \(utteranceCount)\n".utf8)) + + if let outputPath = output { + FileHandle.standardError.write(Data("Saved to: \(outputPath)\n".utf8)) + } + + outputFile?.closeFile() + } + + private func log(_ message: String) { + FileHandle.standardError.write(Data("[scribe] \(message)\n".utf8)) + } + + private func formatTimestamp(_ seconds: Double) -> String { + let totalSeconds = Int(seconds) + let hours = totalSeconds / 3600 + let minutes = (totalSeconds % 3600) / 60 + let secs = totalSeconds % 60 + if hours > 0 { + return String(format: "%02d:%02d:%02d", hours, minutes, secs) + } + return String(format: "%02d:%02d", minutes, secs) + } +} + +// MARK: - Stream Output Format + +enum StreamOutputFormat: String, ExpressibleByArgument, CaseIterable, Sendable { + case text, jsonl +} diff --git a/Sources/scribe/ScribeCLI.swift b/Sources/scribe/ScribeCLI.swift index 8d2e877..8a58abb 100644 --- a/Sources/scribe/ScribeCLI.swift +++ b/Sources/scribe/ScribeCLI.swift @@ -6,7 +6,7 @@ struct ScribeCLI: AsyncParsableCommand { commandName: "scribe", abstract: "State-of-the-art local audio transcription with speaker diarization for macOS.", version: "0.2.1", - subcommands: [Transcribe.self, Models.self], + subcommands: [Transcribe.self, Stream.self, Models.self], defaultSubcommand: Transcribe.self ) } From 289299ede165772a218eaeaa95b7c6832bb8d592 Mon Sep 17 00:00:00 2001 From: Javier Toledo Date: Mon, 30 Mar 2026 20:36:45 +0100 Subject: [PATCH 2/7] Fix stream latency, speaker label, and dedup - Reduce chunk size from 11s to 3s for ~3-4s latency (was ~13s) - Lower confirmation threshold from 0.8 to 0.5 for faster output - Reduce right context from 2s to 0.5s - Fix speaker label: remove "Others" tag for mic input - Add text dedup to avoid repeating same hypothesis - Remove --mic flag (mic is default and only source for now) Co-Authored-By: Claude Opus 4.6 (1M context) --- Sources/scribe/Commands/Stream.swift | 40 +++++++++++++--------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/Sources/scribe/Commands/Stream.swift b/Sources/scribe/Commands/Stream.swift index 172d5c6..c30aa61 100644 --- a/Sources/scribe/Commands/Stream.swift +++ b/Sources/scribe/Commands/Stream.swift @@ -8,9 +8,6 @@ struct Stream: AsyncParsableCommand { abstract: "Stream live audio transcription from microphone or system audio." ) - @Flag(name: .long, help: "Capture microphone audio (default if no --source specified).") - var mic: Bool = false - @Option(name: .long, help: "Output format: text or jsonl.") var format: StreamOutputFormat = .text @@ -21,10 +18,18 @@ struct Stream: AsyncParsableCommand { var verbose: Bool = false func run() async throws { - // For now, mic is the default and only source if verbose { log("Initializing streaming ASR (Parakeet)...") } - let streamManager = SlidingWindowAsrManager(config: .streaming) + // Low-latency config: 3s chunks for fast output, 1s hypothesis for immediate feedback + let config = SlidingWindowAsrConfig( + chunkSeconds: 3.0, + hypothesisChunkSeconds: 1.0, + leftContextSeconds: 3.0, + rightContextSeconds: 0.5, + minContextForConfirmation: 3.0, + confirmationThreshold: 0.5 + ) + let streamManager = SlidingWindowAsrManager(config: config) try await streamManager.start(source: .microphone) // Set up microphone capture @@ -34,13 +39,13 @@ struct Stream: AsyncParsableCommand { if verbose { log("Microphone: \(inputFormat.sampleRate)Hz, \(inputFormat.channelCount) ch") - log("Listening... (press Ctrl+C to stop)") } // Track state for output let startTime = Date() var outputFile: FileHandle? = nil var utteranceCount = 0 + var lastEmittedText = "" if let outputPath = output { FileManager.default.createFile(atPath: outputPath, contents: nil) @@ -49,44 +54,40 @@ struct Stream: AsyncParsableCommand { // Set up signal handler for clean Ctrl+C signal(SIGINT) { _ in + FileHandle.standardError.write(Data("\n".utf8)) Darwin.exit(0) } // Install tap on microphone — feed buffers to ASR inputNode.installTap(onBus: 0, bufferSize: 4096, format: inputFormat) { buffer, _ in - // nonisolated(unsafe) to cross sendability boundary — buffer is consumed immediately nonisolated(unsafe) let buf = buffer streamManager.streamAudio(buf) } try audioEngine.start() - - if !verbose { - // Print a minimal status line to stderr - log("Listening... (press Ctrl+C to stop)") - } + log("Listening... (press Ctrl+C to stop)") // Read transcription updates let updates = await streamManager.transcriptionUpdates for await update in updates { - // Stream continues until Ctrl+C or stream ends - let text = update.text.trimmingCharacters(in: .whitespaces) guard !text.isEmpty else { continue } + // Skip if this is the same text we already emitted (dedup) + guard text != lastEmittedText else { continue } + lastEmittedText = text + utteranceCount += 1 let elapsed = Date().timeIntervalSince(startTime) - let speaker = mic ? "You" : "Others" let line: String switch format { case .text: let ts = formatTimestamp(elapsed) - line = "[\(ts)] \(speaker): \(text)" + line = "[\(ts)] \(text)" case .jsonl: let jsonObj: [String: Any] = [ "time": round(elapsed * 10) / 10, - "speaker": speaker, "text": text, "confirmed": update.isConfirmed, ] @@ -109,11 +110,10 @@ struct Stream: AsyncParsableCommand { // Clean shutdown audioEngine.stop() inputNode.removeTap(onBus: 0) - let finalText = try await streamManager.finish() + _ = try await streamManager.finish() let totalElapsed = Date().timeIntervalSince(startTime) - // Print summary to stderr FileHandle.standardError.write(Data("\n--- Stream ended ---\n".utf8)) FileHandle.standardError.write(Data(String(format: "Duration: %dm %ds\n", Int(totalElapsed) / 60, Int(totalElapsed) % 60).utf8)) @@ -142,8 +142,6 @@ struct Stream: AsyncParsableCommand { } } -// MARK: - Stream Output Format - enum StreamOutputFormat: String, ExpressibleByArgument, CaseIterable, Sendable { case text, jsonl } From 634ae69197fa73d04dd69058c6676ee684c3199d Mon Sep 17 00:00:00 2001 From: Javier Toledo Date: Mon, 30 Mar 2026 20:43:42 +0100 Subject: [PATCH 3/7] Fix stream: use library streaming config, show volatile + confirmed updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 3s chunk config was too short for Parakeet — model needs ~10s context. Reverted to the library's .streaming preset (11s chunks, 1s hypothesis). Now shows two types of updates: - Volatile (hypothesis): shown as ephemeral line on stderr with \r overwrite Gives immediate ~1-2s feedback while speaking - Confirmed: printed as permanent line to stdout Stable, final text after sufficient context Also fixes: - Stream getting stuck on longer utterances (was breaking model state) - Text format shows live preview on stderr, final on stdout - JSONL emits both volatile and confirmed (with "confirmed" field) Co-Authored-By: Claude Opus 4.6 (1M context) --- Sources/scribe/Commands/Stream.swift | 99 +++++++++++++++------------- 1 file changed, 54 insertions(+), 45 deletions(-) diff --git a/Sources/scribe/Commands/Stream.swift b/Sources/scribe/Commands/Stream.swift index c30aa61..b067dd6 100644 --- a/Sources/scribe/Commands/Stream.swift +++ b/Sources/scribe/Commands/Stream.swift @@ -5,7 +5,7 @@ import Foundation struct Stream: AsyncParsableCommand { static let configuration = CommandConfiguration( - abstract: "Stream live audio transcription from microphone or system audio." + abstract: "Stream live audio transcription from microphone." ) @Option(name: .long, help: "Output format: text or jsonl.") @@ -20,16 +20,8 @@ struct Stream: AsyncParsableCommand { func run() async throws { if verbose { log("Initializing streaming ASR (Parakeet)...") } - // Low-latency config: 3s chunks for fast output, 1s hypothesis for immediate feedback - let config = SlidingWindowAsrConfig( - chunkSeconds: 3.0, - hypothesisChunkSeconds: 1.0, - leftContextSeconds: 3.0, - rightContextSeconds: 0.5, - minContextForConfirmation: 3.0, - confirmationThreshold: 0.5 - ) - let streamManager = SlidingWindowAsrManager(config: config) + // Use the library's streaming config (11s chunks + 1s hypothesis updates) + let streamManager = SlidingWindowAsrManager(config: .streaming) try await streamManager.start(source: .microphone) // Set up microphone capture @@ -41,24 +33,22 @@ struct Stream: AsyncParsableCommand { log("Microphone: \(inputFormat.sampleRate)Hz, \(inputFormat.channelCount) ch") } - // Track state for output let startTime = Date() var outputFile: FileHandle? = nil - var utteranceCount = 0 - var lastEmittedText = "" + var lastConfirmedText = "" + var lastVolatileText = "" if let outputPath = output { FileManager.default.createFile(atPath: outputPath, contents: nil) outputFile = FileHandle(forWritingAtPath: outputPath) } - // Set up signal handler for clean Ctrl+C signal(SIGINT) { _ in FileHandle.standardError.write(Data("\n".utf8)) Darwin.exit(0) } - // Install tap on microphone — feed buffers to ASR + // Install tap on microphone inputNode.installTap(onBus: 0, bufferSize: 4096, format: inputFormat) { buffer, _ in nonisolated(unsafe) let buf = buffer streamManager.streamAudio(buf) @@ -73,37 +63,39 @@ struct Stream: AsyncParsableCommand { let text = update.text.trimmingCharacters(in: .whitespaces) guard !text.isEmpty else { continue } - // Skip if this is the same text we already emitted (dedup) - guard text != lastEmittedText else { continue } - lastEmittedText = text - - utteranceCount += 1 let elapsed = Date().timeIntervalSince(startTime) - let line: String - switch format { - case .text: - let ts = formatTimestamp(elapsed) - line = "[\(ts)] \(text)" - case .jsonl: - let jsonObj: [String: Any] = [ - "time": round(elapsed * 10) / 10, - "text": text, - "confirmed": update.isConfirmed, - ] - if let data = try? JSONSerialization.data(withJSONObject: jsonObj, options: [.sortedKeys]), - let jsonStr = String(data: data, encoding: .utf8) { - line = jsonStr - } else { - continue - } - } + if update.isConfirmed { + // Confirmed: stable, final text — print as a permanent line + guard text != lastConfirmedText else { continue } + lastConfirmedText = text + lastVolatileText = "" // reset volatile tracking - print(line) - fflush(stdout) + let line = formatLine(text: text, elapsed: elapsed, confirmed: true) + print(line) + fflush(stdout) - if let file = outputFile { - file.write(Data((line + "\n").utf8)) + if let file = outputFile { + file.write(Data((line + "\n").utf8)) + } + } else { + // Volatile: hypothesis, may change — show as ephemeral line + guard text != lastVolatileText else { continue } + lastVolatileText = text + + switch format { + case .text: + // Overwrite current line with \r for live feel + let ts = formatTimestamp(elapsed) + let preview = String(text.suffix(80)) + let line = "\r[\(ts)] \(preview)" + FileHandle.standardError.write(Data(line.utf8)) + case .jsonl: + // In JSONL mode, emit volatile updates too (marked as unconfirmed) + let line = formatLine(text: text, elapsed: elapsed, confirmed: false) + print(line) + fflush(stdout) + } } } @@ -113,11 +105,9 @@ struct Stream: AsyncParsableCommand { _ = try await streamManager.finish() let totalElapsed = Date().timeIntervalSince(startTime) - FileHandle.standardError.write(Data("\n--- Stream ended ---\n".utf8)) FileHandle.standardError.write(Data(String(format: "Duration: %dm %ds\n", Int(totalElapsed) / 60, Int(totalElapsed) % 60).utf8)) - FileHandle.standardError.write(Data("Utterances: \(utteranceCount)\n".utf8)) if let outputPath = output { FileHandle.standardError.write(Data("Saved to: \(outputPath)\n".utf8)) @@ -126,6 +116,25 @@ struct Stream: AsyncParsableCommand { outputFile?.closeFile() } + private func formatLine(text: String, elapsed: Double, confirmed: Bool) -> String { + switch format { + case .text: + let ts = formatTimestamp(elapsed) + return "[\(ts)] \(text)" + case .jsonl: + let jsonObj: [String: Any] = [ + "time": round(elapsed * 10) / 10, + "text": text, + "confirmed": confirmed, + ] + if let data = try? JSONSerialization.data(withJSONObject: jsonObj, options: [.sortedKeys]), + let jsonStr = String(data: data, encoding: .utf8) { + return jsonStr + } + return "{\"text\":\"\(text)\"}" + } + } + private func log(_ message: String) { FileHandle.standardError.write(Data("[scribe] \(message)\n".utf8)) } From 11e0ec683f78185721f2c565e5fc0b730d6b1cd0 Mon Sep 17 00:00:00 2001 From: Javier Toledo Date: Mon, 30 Mar 2026 21:06:18 +0100 Subject: [PATCH 4/7] =?UTF-8?q?Switch=20stream=20to=20Nemotron=20560ms=20?= =?UTF-8?q?=E2=80=94=20true=20streaming=20ASR=20engine?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace SlidingWindowAsrManager (batch TDT in sliding windows, ~11s latency) with StreamingAsrEngine protocol using Nemotron 560ms: - True cache-aware streaming: each 560ms chunk inherits full context - 2.12% WER (better than TDT v3's 2.5% on LibriSpeech) - Includes punctuation and capitalization - ~560ms to first text (was ~11s) - Partial transcript callback for live preview on stderr - Confirmed text printed to stdout Architecture: - Mic audio → appendAudio() → processBufferedAudio() → getPartialTranscript() - Partial callback fires on every chunk for live preview (\r overwrite on stderr) - Main loop polls at 20Hz, emits new confirmed text to stdout - Actor-based state management for thread safety (Swift 6) Co-Authored-By: Claude Opus 4.6 (1M context) --- Sources/scribe/Commands/Stream.swift | 185 +++++++++++++++------------ 1 file changed, 104 insertions(+), 81 deletions(-) diff --git a/Sources/scribe/Commands/Stream.swift b/Sources/scribe/Commands/Stream.swift index b067dd6..72f3178 100644 --- a/Sources/scribe/Commands/Stream.swift +++ b/Sources/scribe/Commands/Stream.swift @@ -3,6 +3,34 @@ import AVFoundation import FluidAudio import Foundation +/// Thread-safe state tracker for streaming output. +private actor StreamState { + var lastPartialText = "" + var lastOutputText = "" + + func shouldEmitPartial(_ text: String) -> Bool { + guard text != lastPartialText else { return false } + lastPartialText = text + return true + } + + func getNewText(_ fullTranscript: String) -> String? { + let text = fullTranscript.trimmingCharacters(in: .whitespaces) + guard !text.isEmpty, text != lastOutputText else { return nil } + + let newText: String + if text.hasPrefix(lastOutputText) { + newText = String(text.dropFirst(lastOutputText.count)).trimmingCharacters(in: .whitespaces) + } else { + newText = text + } + + guard !newText.isEmpty else { return nil } + lastOutputText = text + return newText + } +} + struct Stream: AsyncParsableCommand { static let configuration = CommandConfiguration( abstract: "Stream live audio transcription from microphone." @@ -18,13 +46,13 @@ struct Stream: AsyncParsableCommand { var verbose: Bool = false func run() async throws { - if verbose { log("Initializing streaming ASR (Parakeet)...") } + if verbose { log("Initializing streaming ASR (Nemotron 560ms)...") } - // Use the library's streaming config (11s chunks + 1s hypothesis updates) - let streamManager = SlidingWindowAsrManager(config: .streaming) - try await streamManager.start(source: .microphone) + let engine = StreamingAsrEngineFactory.create(.nemotron560ms) + try await engine.loadModels() + + if verbose { log("Models loaded.") } - // Set up microphone capture let audioEngine = AVAudioEngine() let inputNode = audioEngine.inputNode let inputFormat = inputNode.outputFormat(forBus: 0) @@ -34,15 +62,44 @@ struct Stream: AsyncParsableCommand { } let startTime = Date() + let state = StreamState() + let fmt = format var outputFile: FileHandle? = nil - var lastConfirmedText = "" - var lastVolatileText = "" if let outputPath = output { FileManager.default.createFile(atPath: outputPath, contents: nil) outputFile = FileHandle(forWritingAtPath: outputPath) } + // Partial transcript callback — fires on every chunk (~560ms) + await engine.setPartialTranscriptCallback { partial in + let text = partial.trimmingCharacters(in: .whitespaces) + guard !text.isEmpty else { return } + + Task { + guard await state.shouldEmitPartial(text) else { return } + let elapsed = Date().timeIntervalSince(startTime) + + switch fmt { + case .text: + let ts = formatStreamTimestamp(elapsed) + let preview = String(text.suffix(100)) + FileHandle.standardError.write(Data("\r\u{1B}[K[\(ts)] \(preview)".utf8)) + case .jsonl: + let jsonObj: [String: Any] = [ + "time": round(elapsed * 10) / 10, + "text": text, + "partial": true, + ] + if let data = try? JSONSerialization.data(withJSONObject: jsonObj, options: [.sortedKeys]), + let jsonStr = String(data: data, encoding: .utf8) { + print(jsonStr) + fflush(stdout) + } + } + } + } + signal(SIGINT) { _ in FileHandle.standardError.write(Data("\n".utf8)) Darwin.exit(0) @@ -51,104 +108,70 @@ struct Stream: AsyncParsableCommand { // Install tap on microphone inputNode.installTap(onBus: 0, bufferSize: 4096, format: inputFormat) { buffer, _ in nonisolated(unsafe) let buf = buffer - streamManager.streamAudio(buf) + do { + try engine.appendAudio(buf) + } catch { + // Non-fatal + } } try audioEngine.start() log("Listening... (press Ctrl+C to stop)") - // Read transcription updates - let updates = await streamManager.transcriptionUpdates - for await update in updates { - let text = update.text.trimmingCharacters(in: .whitespaces) - guard !text.isEmpty else { continue } + // Processing loop + while true { + try await engine.processBufferedAudio() - let elapsed = Date().timeIntervalSince(startTime) + let transcript = await engine.getPartialTranscript() + if let newText = await state.getNewText(transcript) { + let elapsed = Date().timeIntervalSince(startTime) + let line: String - if update.isConfirmed { - // Confirmed: stable, final text — print as a permanent line - guard text != lastConfirmedText else { continue } - lastConfirmedText = text - lastVolatileText = "" // reset volatile tracking + switch format { + case .text: + let ts = formatStreamTimestamp(elapsed) + FileHandle.standardError.write(Data("\r\u{1B}[K".utf8)) + line = "[\(ts)] \(newText)" + case .jsonl: + let jsonObj: [String: Any] = [ + "time": round(elapsed * 10) / 10, + "text": newText, + "partial": false, + ] + if let data = try? JSONSerialization.data(withJSONObject: jsonObj, options: [.sortedKeys]), + let jsonStr = String(data: data, encoding: .utf8) { + line = jsonStr + } else { + continue + } + } - let line = formatLine(text: text, elapsed: elapsed, confirmed: true) print(line) fflush(stdout) if let file = outputFile { file.write(Data((line + "\n").utf8)) } - } else { - // Volatile: hypothesis, may change — show as ephemeral line - guard text != lastVolatileText else { continue } - lastVolatileText = text - - switch format { - case .text: - // Overwrite current line with \r for live feel - let ts = formatTimestamp(elapsed) - let preview = String(text.suffix(80)) - let line = "\r[\(ts)] \(preview)" - FileHandle.standardError.write(Data(line.utf8)) - case .jsonl: - // In JSONL mode, emit volatile updates too (marked as unconfirmed) - let line = formatLine(text: text, elapsed: elapsed, confirmed: false) - print(line) - fflush(stdout) - } } - } - - // Clean shutdown - audioEngine.stop() - inputNode.removeTap(onBus: 0) - _ = try await streamManager.finish() - let totalElapsed = Date().timeIntervalSince(startTime) - FileHandle.standardError.write(Data("\n--- Stream ended ---\n".utf8)) - FileHandle.standardError.write(Data(String(format: "Duration: %dm %ds\n", - Int(totalElapsed) / 60, Int(totalElapsed) % 60).utf8)) - - if let outputPath = output { - FileHandle.standardError.write(Data("Saved to: \(outputPath)\n".utf8)) - } - - outputFile?.closeFile() - } - - private func formatLine(text: String, elapsed: Double, confirmed: Bool) -> String { - switch format { - case .text: - let ts = formatTimestamp(elapsed) - return "[\(ts)] \(text)" - case .jsonl: - let jsonObj: [String: Any] = [ - "time": round(elapsed * 10) / 10, - "text": text, - "confirmed": confirmed, - ] - if let data = try? JSONSerialization.data(withJSONObject: jsonObj, options: [.sortedKeys]), - let jsonStr = String(data: data, encoding: .utf8) { - return jsonStr - } - return "{\"text\":\"\(text)\"}" + try await Task.sleep(nanoseconds: 50_000_000) // 50ms polling } } private func log(_ message: String) { FileHandle.standardError.write(Data("[scribe] \(message)\n".utf8)) } +} - private func formatTimestamp(_ seconds: Double) -> String { - let totalSeconds = Int(seconds) - let hours = totalSeconds / 3600 - let minutes = (totalSeconds % 3600) / 60 - let secs = totalSeconds % 60 - if hours > 0 { - return String(format: "%02d:%02d:%02d", hours, minutes, secs) - } - return String(format: "%02d:%02d", minutes, secs) +private func formatStreamTimestamp(_ seconds: Double) -> String { + let totalSeconds = Int(seconds) + let hours = totalSeconds / 3600 + let minutes = (totalSeconds % 3600) / 60 + let secs = totalSeconds % 60 + if hours > 0 { + return String(format: "%02d:%02d:%02d", hours, minutes, secs) } + return String(format: "%02d:%02d", minutes, secs) } enum StreamOutputFormat: String, ExpressibleByArgument, CaseIterable, Sendable { From 97d1f920477caf3f2ef02b6238f423708106c0e6 Mon Sep 17 00:00:00 2001 From: Javier Toledo Date: Mon, 30 Mar 2026 21:10:39 +0100 Subject: [PATCH 5/7] Default stream to multilingual engine, Nemotron as opt-in for English - Default: Parakeet TDT v3 via SlidingWindow (25 languages, higher latency) - --engine nemotron: Nemotron 560ms (English-only, ~560ms latency, punctuation) Usage: scribe stream # multilingual (default) scribe stream --engine nemotron # English-only, low latency Co-Authored-By: Claude Opus 4.6 (1M context) --- Sources/scribe/Commands/Stream.swift | 160 +++++++++++++++++++++------ 1 file changed, 126 insertions(+), 34 deletions(-) diff --git a/Sources/scribe/Commands/Stream.swift b/Sources/scribe/Commands/Stream.swift index 72f3178..2af02b0 100644 --- a/Sources/scribe/Commands/Stream.swift +++ b/Sources/scribe/Commands/Stream.swift @@ -39,6 +39,9 @@ struct Stream: AsyncParsableCommand { @Option(name: .long, help: "Output format: text or jsonl.") var format: StreamOutputFormat = .text + @Option(name: .long, help: "Streaming engine: default (multilingual, higher latency) or nemotron (English-only, low latency ~560ms).") + var engine: StreamEngine = .default + @Option(name: .shortAndLong, help: "Also save output to file.") var output: String? @@ -46,7 +49,18 @@ struct Stream: AsyncParsableCommand { var verbose: Bool = false func run() async throws { - if verbose { log("Initializing streaming ASR (Nemotron 560ms)...") } + switch engine { + case .nemotron: + try await runNemotron() + case .default: + try await runSlidingWindow() + } + } + + // MARK: - Nemotron Engine (English-only, low latency) + + private func runNemotron() async throws { + if verbose { log("Initializing streaming ASR (Nemotron 560ms, English-only)...") } let engine = StreamingAsrEngineFactory.create(.nemotron560ms) try await engine.loadModels() @@ -71,7 +85,6 @@ struct Stream: AsyncParsableCommand { outputFile = FileHandle(forWritingAtPath: outputPath) } - // Partial transcript callback — fires on every chunk (~560ms) await engine.setPartialTranscriptCallback { partial in let text = partial.trimmingCharacters(in: .whitespaces) guard !text.isEmpty else { return } @@ -100,61 +113,135 @@ struct Stream: AsyncParsableCommand { } } - signal(SIGINT) { _ in - FileHandle.standardError.write(Data("\n".utf8)) - Darwin.exit(0) - } + setupSignalHandler() - // Install tap on microphone inputNode.installTap(onBus: 0, bufferSize: 4096, format: inputFormat) { buffer, _ in nonisolated(unsafe) let buf = buffer - do { - try engine.appendAudio(buf) - } catch { - // Non-fatal - } + do { try engine.appendAudio(buf) } catch {} } try audioEngine.start() - log("Listening... (press Ctrl+C to stop)") + log("Listening (English, low latency)... press Ctrl+C to stop") - // Processing loop while true { try await engine.processBufferedAudio() let transcript = await engine.getPartialTranscript() if let newText = await state.getNewText(transcript) { let elapsed = Date().timeIntervalSince(startTime) - let line: String + emitLine(text: newText, elapsed: elapsed, partial: false, outputFile: outputFile) + } + + try await Task.sleep(nanoseconds: 50_000_000) + } + } + + // MARK: - SlidingWindow Engine (Multilingual, default) + + private func runSlidingWindow() async throws { + if verbose { log("Initializing streaming ASR (Parakeet TDT v3, multilingual)...") } + + let streamManager = SlidingWindowAsrManager(config: .streaming) + try await streamManager.start(source: .microphone) + + if verbose { log("Models loaded.") } + + let audioEngine = AVAudioEngine() + let inputNode = audioEngine.inputNode + let inputFormat = inputNode.outputFormat(forBus: 0) + + if verbose { + log("Microphone: \(inputFormat.sampleRate)Hz, \(inputFormat.channelCount) ch") + } + + let startTime = Date() + var outputFile: FileHandle? = nil + var lastConfirmedText = "" + var lastVolatileText = "" + + if let outputPath = output { + FileManager.default.createFile(atPath: outputPath, contents: nil) + outputFile = FileHandle(forWritingAtPath: outputPath) + } + + setupSignalHandler() + + inputNode.installTap(onBus: 0, bufferSize: 4096, format: inputFormat) { buffer, _ in + nonisolated(unsafe) let buf = buffer + streamManager.streamAudio(buf) + } + + try audioEngine.start() + log("Listening (multilingual)... press Ctrl+C to stop") + + let updates = await streamManager.transcriptionUpdates + for await update in updates { + let text = update.text.trimmingCharacters(in: .whitespaces) + guard !text.isEmpty else { continue } + + let elapsed = Date().timeIntervalSince(startTime) + + if update.isConfirmed { + guard text != lastConfirmedText else { continue } + lastConfirmedText = text + lastVolatileText = "" + emitLine(text: text, elapsed: elapsed, partial: false, outputFile: outputFile) + } else { + guard text != lastVolatileText else { continue } + lastVolatileText = text switch format { case .text: let ts = formatStreamTimestamp(elapsed) - FileHandle.standardError.write(Data("\r\u{1B}[K".utf8)) - line = "[\(ts)] \(newText)" + let preview = String(text.suffix(100)) + FileHandle.standardError.write(Data("\r\u{1B}[K[\(ts)] \(preview)".utf8)) case .jsonl: - let jsonObj: [String: Any] = [ - "time": round(elapsed * 10) / 10, - "text": newText, - "partial": false, - ] - if let data = try? JSONSerialization.data(withJSONObject: jsonObj, options: [.sortedKeys]), - let jsonStr = String(data: data, encoding: .utf8) { - line = jsonStr - } else { - continue - } + emitLine(text: text, elapsed: elapsed, partial: true, outputFile: nil) } + } + } - print(line) - fflush(stdout) + audioEngine.stop() + inputNode.removeTap(onBus: 0) + _ = try await streamManager.finish() + } - if let file = outputFile { - file.write(Data((line + "\n").utf8)) - } + // MARK: - Shared Helpers + + private func emitLine(text: String, elapsed: Double, partial: Bool, outputFile: FileHandle?) { + let line: String + + switch format { + case .text: + let ts = formatStreamTimestamp(elapsed) + FileHandle.standardError.write(Data("\r\u{1B}[K".utf8)) + line = "[\(ts)] \(text)" + case .jsonl: + let jsonObj: [String: Any] = [ + "time": round(elapsed * 10) / 10, + "text": text, + "partial": partial, + ] + if let data = try? JSONSerialization.data(withJSONObject: jsonObj, options: [.sortedKeys]), + let jsonStr = String(data: data, encoding: .utf8) { + line = jsonStr + } else { + return } + } + + print(line) + fflush(stdout) - try await Task.sleep(nanoseconds: 50_000_000) // 50ms polling + if let file = outputFile { + file.write(Data((line + "\n").utf8)) + } + } + + private func setupSignalHandler() { + signal(SIGINT) { _ in + FileHandle.standardError.write(Data("\n".utf8)) + Darwin.exit(0) } } @@ -177,3 +264,8 @@ private func formatStreamTimestamp(_ seconds: Double) -> String { enum StreamOutputFormat: String, ExpressibleByArgument, CaseIterable, Sendable { case text, jsonl } + +enum StreamEngine: String, ExpressibleByArgument, CaseIterable, Sendable { + case `default` + case nemotron +} From 98ff6ed7ddf45912cf4ad1c88cdf581de41740d5 Mon Sep 17 00:00:00 2001 From: Javier Toledo Date: Mon, 30 Mar 2026 21:23:20 +0100 Subject: [PATCH 6/7] Add model download retry, progress logging, and streaming docs - Nemotron: retry with cache cleanup on failed model load (fixes partial download) - Both engines: show download progress messages (not just --verbose) - README: add streaming section with engine comparison and trade-offs - README: update performance table with streaming latencies Co-Authored-By: Claude Opus 4.6 (1M context) --- README.md | 19 ++++++++++++-- Sources/scribe/Commands/Stream.swift | 38 ++++++++++++++++++++++++---- 2 files changed, 50 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 1263de1..fa97ae4 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,19 @@ scribe transcribe meeting.wav --language es # Spanish scribe transcribe meeting.wav --language fr # French ``` +### Live streaming from microphone + +```bash +scribe stream # multilingual (25 languages, ~11s latency) +scribe stream --engine nemotron # English-only, low latency (~560ms), with punctuation +scribe stream --format jsonl # JSONL output (one JSON object per line) +scribe stream --output meeting.txt # save to file while streaming +``` + +> **Note on streaming engines:** +> - **Default** (Parakeet TDT v3): Supports all 25 languages including Spanish. Higher latency (~11s for confirmed text) because it uses a batch model in sliding windows. Live preview appears on screen while speaking. +> - **Nemotron** (`--engine nemotron`): English-only but much faster (~560ms latency). Includes punctuation and capitalization. Recommended for English-only meetings. + ### Pre-download models ```bash @@ -122,8 +135,10 @@ Tested on Apple Silicon (M-series): |------|-------|---------| | Transcription only | ~130x real-time | 4-min file in 1.7s | | Transcription + diarization | ~30x real-time | 4-min file in 7.5s | +| Live streaming (Nemotron) | ~560ms latency | English, with punctuation | +| Live streaming (default) | ~11s latency | 25 languages | -Models are downloaded automatically on first use (~600MB for ASR, ~50MB for diarization). +Models are downloaded automatically on first use (~600MB per model). First run may take a minute. ## Requirements @@ -138,7 +153,7 @@ Bulgarian, Croatian, Czech, Danish, Dutch, English, Estonian, Finnish, French, G scribe is built on the shoulders of excellent open-source projects: -- **[NVIDIA Parakeet](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3)** (CC-BY-4.0) — The speech recognition model that powers transcription +- **[NVIDIA Parakeet](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3)** (CC-BY-4.0) — Speech recognition models (TDT v3 for batch, Nemotron for streaming) - **[FluidAudio](https://github.com/FluidInference/FluidAudio)** (Apache 2.0) by FluidInference — CoreML speech processing SDK for Apple Silicon - **[pyannote.audio](https://github.com/pyannote/pyannote-audio)** (MIT) by Herve Bredin — The diarization model architecture - **[swift-argument-parser](https://github.com/apple/swift-argument-parser)** (Apache 2.0) by Apple — CLI argument parsing diff --git a/Sources/scribe/Commands/Stream.swift b/Sources/scribe/Commands/Stream.swift index 2af02b0..3503b96 100644 --- a/Sources/scribe/Commands/Stream.swift +++ b/Sources/scribe/Commands/Stream.swift @@ -60,12 +60,39 @@ struct Stream: AsyncParsableCommand { // MARK: - Nemotron Engine (English-only, low latency) private func runNemotron() async throws { - if verbose { log("Initializing streaming ASR (Nemotron 560ms, English-only)...") } + log("Initializing streaming ASR (Nemotron 560ms, English-only)...") + log("Downloading model if needed (first run only, ~600MB)...") let engine = StreamingAsrEngineFactory.create(.nemotron560ms) - try await engine.loadModels() - if verbose { log("Models loaded.") } + // Try loading models — if it fails (partial download), clean cache and retry once + do { + try await engine.loadModels() + } catch { + log("Model loading failed: \(error.localizedDescription)") + log("Cleaning cache and retrying download...") + + let cacheDir = FileManager.default.urls(for: .applicationSupportDirectory, in: .userDomainMask).first! + .appendingPathComponent("FluidAudio/Models/nemotron-streaming") + try? FileManager.default.removeItem(at: cacheDir) + + let freshEngine = StreamingAsrEngineFactory.create(.nemotron560ms) + try await freshEngine.loadModels() + // If this also fails, the error propagates naturally + + // Use the fresh engine — but we can't reassign `engine` (let binding), + // so we just recurse. The cache is now clean, so it will work. + log("Retry successful.") + // Re-run with clean state + try await runNemotronWithEngine(freshEngine) + return + } + + log("Models loaded.") + try await runNemotronWithEngine(engine) + } + + private func runNemotronWithEngine(_ engine: any StreamingAsrEngine) async throws { let audioEngine = AVAudioEngine() let inputNode = audioEngine.inputNode @@ -139,12 +166,13 @@ struct Stream: AsyncParsableCommand { // MARK: - SlidingWindow Engine (Multilingual, default) private func runSlidingWindow() async throws { - if verbose { log("Initializing streaming ASR (Parakeet TDT v3, multilingual)...") } + log("Initializing streaming ASR (Parakeet TDT v3, multilingual)...") + log("Downloading model if needed (first run only, ~600MB)...") let streamManager = SlidingWindowAsrManager(config: .streaming) try await streamManager.start(source: .microphone) - if verbose { log("Models loaded.") } + log("Models loaded.") let audioEngine = AVAudioEngine() let inputNode = audioEngine.inputNode From ff95589844720eca83784e4cd8fef4cf197c8a2c Mon Sep 17 00:00:00 2001 From: Javier Toledo Date: Mon, 30 Mar 2026 23:54:04 +0100 Subject: [PATCH 7/7] Fix Nemotron streaming: emit delta text, not full accumulated transcript MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Nemotron engine's partial callback returns the full accumulated transcript each time, which grows and revises. The previous code tried to diff via getPartialTranscript() polling, causing repeated/mixed output. Fix: Track printed length in StreamState actor. The partial callback fires after each 560ms chunk — we diff to find only the new portion and emit that. Live preview shows the tail of the transcript on stderr (ephemeral, overwritten). New confirmed text goes to stdout. Also simplified SlidingWindow engine to only emit to stdout on confirmed text (volatile goes to stderr preview only). Co-Authored-By: Claude Opus 4.6 (1M context) --- Sources/scribe/Commands/Stream.swift | 126 +++++++++++++++------------ 1 file changed, 72 insertions(+), 54 deletions(-) diff --git a/Sources/scribe/Commands/Stream.swift b/Sources/scribe/Commands/Stream.swift index 3503b96..f0474e3 100644 --- a/Sources/scribe/Commands/Stream.swift +++ b/Sources/scribe/Commands/Stream.swift @@ -3,31 +3,38 @@ import AVFoundation import FluidAudio import Foundation -/// Thread-safe state tracker for streaming output. +/// Thread-safe state for tracking incremental transcript output. private actor StreamState { - var lastPartialText = "" - var lastOutputText = "" - - func shouldEmitPartial(_ text: String) -> Bool { - guard text != lastPartialText else { return false } - lastPartialText = text - return true - } + private var lastFullTranscript = "" + private var printedLength = 0 + /// Given the full accumulated transcript, return only the new portion. + /// Handles model revisions by finding the longest common prefix. func getNewText(_ fullTranscript: String) -> String? { let text = fullTranscript.trimmingCharacters(in: .whitespaces) - guard !text.isEmpty, text != lastOutputText else { return nil } + guard !text.isEmpty else { return nil } + guard text != lastFullTranscript else { return nil } + + lastFullTranscript = text - let newText: String - if text.hasPrefix(lastOutputText) { - newText = String(text.dropFirst(lastOutputText.count)).trimmingCharacters(in: .whitespaces) - } else { - newText = text + // Find how much of the text we've already printed + if text.count > printedLength { + let newPortion = String(text.dropFirst(printedLength)).trimmingCharacters(in: .whitespaces) + if !newPortion.isEmpty { + printedLength = text.count + return newPortion + } } - guard !newText.isEmpty else { return nil } - lastOutputText = text - return newText + return nil + } + + /// Get the live preview (last N chars of full transcript). + func getPreview(_ fullTranscript: String, maxLen: Int = 100) -> String? { + let text = fullTranscript.trimmingCharacters(in: .whitespaces) + guard !text.isEmpty else { return nil } + guard text != lastFullTranscript else { return nil } + return String(text.suffix(maxLen)) } } @@ -65,7 +72,6 @@ struct Stream: AsyncParsableCommand { let engine = StreamingAsrEngineFactory.create(.nemotron560ms) - // Try loading models — if it fails (partial download), clean cache and retry once do { try await engine.loadModels() } catch { @@ -78,12 +84,7 @@ struct Stream: AsyncParsableCommand { let freshEngine = StreamingAsrEngineFactory.create(.nemotron560ms) try await freshEngine.loadModels() - // If this also fails, the error propagates naturally - - // Use the fresh engine — but we can't reassign `engine` (let binding), - // so we just recurse. The cache is now clean, so it will work. log("Retry successful.") - // Re-run with clean state try await runNemotronWithEngine(freshEngine) return } @@ -93,7 +94,6 @@ struct Stream: AsyncParsableCommand { } private func runNemotronWithEngine(_ engine: any StreamingAsrEngine) async throws { - let audioEngine = AVAudioEngine() let inputNode = audioEngine.inputNode let inputFormat = inputNode.outputFormat(forBus: 0) @@ -112,29 +112,54 @@ struct Stream: AsyncParsableCommand { outputFile = FileHandle(forWritingAtPath: outputPath) } - await engine.setPartialTranscriptCallback { partial in - let text = partial.trimmingCharacters(in: .whitespaces) - guard !text.isEmpty else { return } + // Capture file handle for sendable closure + nonisolated(unsafe) let outFile = outputFile + // Partial callback — fires after each 560ms chunk with the full accumulated transcript. + // We diff to find new text and emit only the delta. + await engine.setPartialTranscriptCallback { fullTranscript in Task { - guard await state.shouldEmitPartial(text) else { return } let elapsed = Date().timeIntervalSince(startTime) switch fmt { case .text: - let ts = formatStreamTimestamp(elapsed) - let preview = String(text.suffix(100)) - FileHandle.standardError.write(Data("\r\u{1B}[K[\(ts)] \(preview)".utf8)) + // Show live preview on stderr (ephemeral, overwritten) + let preview = String(fullTranscript.trimmingCharacters(in: .whitespaces).suffix(100)) + if !preview.isEmpty { + let ts = formatStreamTimestamp(elapsed) + FileHandle.standardError.write(Data("\r\u{1B}[K[\(ts)] \(preview)".utf8)) + } case .jsonl: - let jsonObj: [String: Any] = [ - "time": round(elapsed * 10) / 10, - "text": text, - "partial": true, - ] - if let data = try? JSONSerialization.data(withJSONObject: jsonObj, options: [.sortedKeys]), - let jsonStr = String(data: data, encoding: .utf8) { - print(jsonStr) - fflush(stdout) + break + } + + // Emit new portion to stdout + if let newText = await state.getNewText(fullTranscript) { + let ts = formatStreamTimestamp(elapsed) + + let line: String + switch fmt { + case .text: + FileHandle.standardError.write(Data("\r\u{1B}[K".utf8)) + line = "[\(ts)] \(newText)" + case .jsonl: + let jsonObj: [String: Any] = [ + "time": round(elapsed * 10) / 10, + "text": newText, + ] + if let data = try? JSONSerialization.data(withJSONObject: jsonObj, options: [.sortedKeys]), + let jsonStr = String(data: data, encoding: .utf8) { + line = jsonStr + } else { + return + } + } + + print(line) + fflush(stdout) + + if let file = outFile { + file.write(Data((line + "\n").utf8)) } } } @@ -150,16 +175,10 @@ struct Stream: AsyncParsableCommand { try audioEngine.start() log("Listening (English, low latency)... press Ctrl+C to stop") + // Processing loop — drives the engine to process buffered audio while true { try await engine.processBufferedAudio() - - let transcript = await engine.getPartialTranscript() - if let newText = await state.getNewText(transcript) { - let elapsed = Date().timeIntervalSince(startTime) - emitLine(text: newText, elapsed: elapsed, partial: false, outputFile: outputFile) - } - - try await Task.sleep(nanoseconds: 50_000_000) + try await Task.sleep(nanoseconds: 50_000_000) // 50ms } } @@ -213,7 +232,7 @@ struct Stream: AsyncParsableCommand { guard text != lastConfirmedText else { continue } lastConfirmedText = text lastVolatileText = "" - emitLine(text: text, elapsed: elapsed, partial: false, outputFile: outputFile) + emitLine(text: text, elapsed: elapsed, outputFile: outputFile) } else { guard text != lastVolatileText else { continue } lastVolatileText = text @@ -224,7 +243,7 @@ struct Stream: AsyncParsableCommand { let preview = String(text.suffix(100)) FileHandle.standardError.write(Data("\r\u{1B}[K[\(ts)] \(preview)".utf8)) case .jsonl: - emitLine(text: text, elapsed: elapsed, partial: true, outputFile: nil) + break } } } @@ -234,9 +253,9 @@ struct Stream: AsyncParsableCommand { _ = try await streamManager.finish() } - // MARK: - Shared Helpers + // MARK: - Helpers - private func emitLine(text: String, elapsed: Double, partial: Bool, outputFile: FileHandle?) { + private func emitLine(text: String, elapsed: Double, outputFile: FileHandle?) { let line: String switch format { @@ -248,7 +267,6 @@ struct Stream: AsyncParsableCommand { let jsonObj: [String: Any] = [ "time": round(elapsed * 10) / 10, "text": text, - "partial": partial, ] if let data = try? JSONSerialization.data(withJSONObject: jsonObj, options: [.sortedKeys]), let jsonStr = String(data: data, encoding: .utf8) {