diff --git a/README.md b/README.md
index 1263de1..fa97ae4 100644
--- a/README.md
+++ b/README.md
@@ -72,6 +72,19 @@
 scribe transcribe meeting.wav --language es # Spanish
 scribe transcribe meeting.wav --language fr # French
 ```
 
+### Live streaming from microphone
+
+```bash
+scribe stream # multilingual (25 languages, ~11s latency)
+scribe stream --engine nemotron # English-only, low latency (~560ms), with punctuation
+scribe stream --format jsonl # JSONL output (one JSON object per line)
+scribe stream --output meeting.txt # save to file while streaming
+```
+
+> **Note on streaming engines:**
+> - **Default** (Parakeet TDT v3): Supports all 25 languages including Spanish. Higher latency (~11s for confirmed text) because it uses a batch model in sliding windows. A live preview appears on screen while speaking.
+> - **Nemotron** (`--engine nemotron`): English-only but much faster (~560ms latency). Includes punctuation and capitalization. Recommended for English-only meetings.
+
 ### Pre-download models
 
 ```bash
@@ -122,8 +135,10 @@ Tested on Apple Silicon (M-series):
 |------|-------|---------|
 | Transcription only | ~130x real-time | 4-min file in 1.7s |
 | Transcription + diarization | ~30x real-time | 4-min file in 7.5s |
+| Live streaming (Nemotron) | ~560ms latency | English, with punctuation |
+| Live streaming (default) | ~11s latency | 25 languages |
 
-Models are downloaded automatically on first use (~600MB for ASR, ~50MB for diarization).
+Models are downloaded automatically on first use (~600MB per model). First run may take a minute.
## Requirements @@ -138,7 +153,7 @@ Bulgarian, Croatian, Czech, Danish, Dutch, English, Estonian, Finnish, French, G scribe is built on the shoulders of excellent open-source projects: -- **[NVIDIA Parakeet](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3)** (CC-BY-4.0) — The speech recognition model that powers transcription +- **[NVIDIA Parakeet](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3)** (CC-BY-4.0) — Speech recognition models (TDT v3 for batch, Nemotron for streaming) - **[FluidAudio](https://github.com/FluidInference/FluidAudio)** (Apache 2.0) by FluidInference — CoreML speech processing SDK for Apple Silicon - **[pyannote.audio](https://github.com/pyannote/pyannote-audio)** (MIT) by Herve Bredin — The diarization model architecture - **[swift-argument-parser](https://github.com/apple/swift-argument-parser)** (Apache 2.0) by Apple — CLI argument parsing diff --git a/Sources/scribe/Commands/Stream.swift b/Sources/scribe/Commands/Stream.swift new file mode 100644 index 0000000..f0474e3 --- /dev/null +++ b/Sources/scribe/Commands/Stream.swift @@ -0,0 +1,317 @@ +import ArgumentParser +import AVFoundation +import FluidAudio +import Foundation + +/// Thread-safe state for tracking incremental transcript output. +private actor StreamState { + private var lastFullTranscript = "" + private var printedLength = 0 + + /// Given the full accumulated transcript, return only the new portion. + /// Handles model revisions by finding the longest common prefix. + func getNewText(_ fullTranscript: String) -> String? 
{ + let text = fullTranscript.trimmingCharacters(in: .whitespaces) + guard !text.isEmpty else { return nil } + guard text != lastFullTranscript else { return nil } + + lastFullTranscript = text + + // Find how much of the text we've already printed + if text.count > printedLength { + let newPortion = String(text.dropFirst(printedLength)).trimmingCharacters(in: .whitespaces) + if !newPortion.isEmpty { + printedLength = text.count + return newPortion + } + } + + return nil + } + + /// Get the live preview (last N chars of full transcript). + func getPreview(_ fullTranscript: String, maxLen: Int = 100) -> String? { + let text = fullTranscript.trimmingCharacters(in: .whitespaces) + guard !text.isEmpty else { return nil } + guard text != lastFullTranscript else { return nil } + return String(text.suffix(maxLen)) + } +} + +struct Stream: AsyncParsableCommand { + static let configuration = CommandConfiguration( + abstract: "Stream live audio transcription from microphone." + ) + + @Option(name: .long, help: "Output format: text or jsonl.") + var format: StreamOutputFormat = .text + + @Option(name: .long, help: "Streaming engine: default (multilingual, higher latency) or nemotron (English-only, low latency ~560ms).") + var engine: StreamEngine = .default + + @Option(name: .shortAndLong, help: "Also save output to file.") + var output: String? 
+
+    @Flag(name: .long, help: "Show status information.")
+    var verbose: Bool = false
+
+    /// Entry point: dispatch to the engine selected by `--engine`.
+    func run() async throws {
+        switch engine {
+        case .nemotron:
+            try await runNemotron()
+        case .default:
+            try await runSlidingWindow()
+        }
+    }
+
+    // MARK: - Nemotron Engine (English-only, low latency)
+
+    /// Load the Nemotron streaming models and hand off to the streaming loop.
+    /// On a load failure (e.g. corrupt partial download) the model cache is
+    /// wiped and the download retried exactly once with a fresh engine.
+    private func runNemotron() async throws {
+        log("Initializing streaming ASR (Nemotron 560ms, English-only)...")
+        log("Downloading model if needed (first run only, ~600MB)...")
+
+        let engine = StreamingAsrEngineFactory.create(.nemotron560ms)
+
+        do {
+            try await engine.loadModels()
+        } catch {
+            log("Model loading failed: \(error.localizedDescription)")
+            log("Cleaning cache and retrying download...")
+
+            // Best-effort cache wipe; a second loadModels failure propagates.
+            let cacheDir = FileManager.default.urls(for: .applicationSupportDirectory, in: .userDomainMask).first!
+                .appendingPathComponent("FluidAudio/Models/nemotron-streaming")
+            try? FileManager.default.removeItem(at: cacheDir)
+
+            let freshEngine = StreamingAsrEngineFactory.create(.nemotron560ms)
+            try await freshEngine.loadModels()
+            log("Retry successful.")
+            try await runNemotronWithEngine(freshEngine)
+            return
+        }
+
+        log("Models loaded.")
+        try await runNemotronWithEngine(engine)
+    }
+
+    /// Capture microphone audio, feed it to the streaming engine, and emit
+    /// transcript deltas until the process is interrupted (Ctrl+C).
+    private func runNemotronWithEngine(_ engine: any StreamingAsrEngine) async throws {
+        let audioEngine = AVAudioEngine()
+        let inputNode = audioEngine.inputNode
+        let inputFormat = inputNode.outputFormat(forBus: 0)
+
+        if verbose {
+            log("Microphone: \(inputFormat.sampleRate)Hz, \(inputFormat.channelCount) ch")
+        }
+
+        let startTime = Date()
+        let state = StreamState()
+        let fmt = format
+        var outputFile: FileHandle? = nil
+
+        if let outputPath = output {
+            FileManager.default.createFile(atPath: outputPath, contents: nil)
+            outputFile = FileHandle(forWritingAtPath: outputPath)
+        }
+
+        // Capture file handle for sendable closure
+        nonisolated(unsafe) let outFile = outputFile
+
+        // Partial callback — fires after each 560ms chunk with the full
+        // accumulated transcript. We diff against what was already printed
+        // and emit only the delta.
+        // NOTE(review): each callback spawns an unstructured Task; ordering
+        // between consecutive deltas is not guaranteed — confirm acceptable.
+        await engine.setPartialTranscriptCallback { fullTranscript in
+            Task {
+                let elapsed = Date().timeIntervalSince(startTime)
+
+                switch fmt {
+                case .text:
+                    // Show live preview on stderr (ephemeral, overwritten via \r + erase-line)
+                    let preview = String(fullTranscript.trimmingCharacters(in: .whitespaces).suffix(100))
+                    if !preview.isEmpty {
+                        let ts = formatStreamTimestamp(elapsed)
+                        FileHandle.standardError.write(Data("\r\u{1B}[K[\(ts)] \(preview)".utf8))
+                    }
+                case .jsonl:
+                    break
+                }
+
+                // Emit only the new portion to stdout
+                if let newText = await state.getNewText(fullTranscript) {
+                    let ts = formatStreamTimestamp(elapsed)
+
+                    let line: String
+                    switch fmt {
+                    case .text:
+                        // Clear the ephemeral preview before printing a durable line.
+                        FileHandle.standardError.write(Data("\r\u{1B}[K".utf8))
+                        line = "[\(ts)] \(newText)"
+                    case .jsonl:
+                        let jsonObj: [String: Any] = [
+                            "time": round(elapsed * 10) / 10,
+                            "text": newText,
+                        ]
+                        if let data = try? JSONSerialization.data(withJSONObject: jsonObj, options: [.sortedKeys]),
+                            let jsonStr = String(data: data, encoding: .utf8) {
+                            line = jsonStr
+                        } else {
+                            // Serializing String/Double values should not fail;
+                            // drop the line rather than crash if it does.
+                            return
+                        }
+                    }
+
+                    print(line)
+                    fflush(stdout)
+
+                    if let file = outFile {
+                        file.write(Data((line + "\n").utf8))
+                    }
+                }
+            }
+        }
+
+        setupSignalHandler()
+
+        inputNode.installTap(onBus: 0, bufferSize: 4096, format: inputFormat) { buffer, _ in
+            nonisolated(unsafe) let buf = buffer
+            // Best-effort: drop the chunk on append failure rather than crash
+            // the realtime tap. NOTE(review): errors are silently swallowed;
+            // consider at least counting drops under --verbose.
+            do { try engine.appendAudio(buf) } catch {}
+        }
+
+        try audioEngine.start()
+        log("Listening (English, low latency)... press Ctrl+C to stop")
+
+        // Processing loop — drives the engine to process buffered audio.
+        // NOTE(review): this loop never exits normally; shutdown happens only
+        // via the SIGINT handler's process exit, so audioEngine.stop(), tap
+        // removal and output-file close are never reached. Cancellation is
+        // checked so a cancelled task does not spin forever.
+        while true {
+            try Task.checkCancellation()
+            try await engine.processBufferedAudio()
+            try await Task.sleep(nanoseconds: 50_000_000) // 50ms
+        }
+    }
+
+    // MARK: - SlidingWindow Engine (Multilingual, default)
+
+    /// Multilingual streaming via sliding-window batch transcription.
+    /// Confirmed text is emitted as durable lines; volatile (still-revisable)
+    /// text is shown only as an ephemeral stderr preview.
+    private func runSlidingWindow() async throws {
+        log("Initializing streaming ASR (Parakeet TDT v3, multilingual)...")
+        log("Downloading model if needed (first run only, ~600MB)...")
+
+        let streamManager = SlidingWindowAsrManager(config: .streaming)
+        try await streamManager.start(source: .microphone)
+
+        log("Models loaded.")
+
+        let audioEngine = AVAudioEngine()
+        let inputNode = audioEngine.inputNode
+        let inputFormat = inputNode.outputFormat(forBus: 0)
+
+        if verbose {
+            log("Microphone: \(inputFormat.sampleRate)Hz, \(inputFormat.channelCount) ch")
+        }
+
+        let startTime = Date()
+        var outputFile: FileHandle? = nil
+        var lastConfirmedText = ""
+        var lastVolatileText = ""
+
+        if let outputPath = output {
+            FileManager.default.createFile(atPath: outputPath, contents: nil)
+            outputFile = FileHandle(forWritingAtPath: outputPath)
+        }
+
+        setupSignalHandler()
+
+        inputNode.installTap(onBus: 0, bufferSize: 4096, format: inputFormat) { buffer, _ in
+            nonisolated(unsafe) let buf = buffer
+            streamManager.streamAudio(buf)
+        }
+
+        try audioEngine.start()
+        log("Listening (multilingual)... 
press Ctrl+C to stop")
+
+        // Consume the async update stream until it finishes.
+        let updates = await streamManager.transcriptionUpdates
+        for await update in updates {
+            let text = update.text.trimmingCharacters(in: .whitespaces)
+            guard !text.isEmpty else { continue }
+
+            let elapsed = Date().timeIntervalSince(startTime)
+
+            if update.isConfirmed {
+                // Confirmed text: emit each distinct transcript exactly once.
+                guard text != lastConfirmedText else { continue }
+                lastConfirmedText = text
+                lastVolatileText = ""
+                emitLine(text: text, elapsed: elapsed, outputFile: outputFile)
+            } else {
+                // Volatile text: ephemeral stderr preview only (text mode).
+                guard text != lastVolatileText else { continue }
+                lastVolatileText = text
+
+                switch format {
+                case .text:
+                    let ts = formatStreamTimestamp(elapsed)
+                    let preview = String(text.suffix(100))
+                    FileHandle.standardError.write(Data("\r\u{1B}[K[\(ts)] \(preview)".utf8))
+                case .jsonl:
+                    break
+                }
+            }
+        }
+
+        // Reached only if the update stream ends (normal SIGINT exits earlier).
+        audioEngine.stop()
+        inputNode.removeTap(onBus: 0)
+        _ = try await streamManager.finish()
+    }
+
+    // MARK: - Helpers
+
+    /// Format one confirmed line for the selected output format, print it to
+    /// stdout, and mirror it (with trailing newline) to the optional file.
+    private func emitLine(text: String, elapsed: Double, outputFile: FileHandle?) {
+        let line: String
+
+        switch format {
+        case .text:
+            let ts = formatStreamTimestamp(elapsed)
+            // Clear any ephemeral preview before printing a durable line.
+            FileHandle.standardError.write(Data("\r\u{1B}[K".utf8))
+            line = "[\(ts)] \(text)"
+        case .jsonl:
+            let jsonObj: [String: Any] = [
+                "time": round(elapsed * 10) / 10,
+                "text": text,
+            ]
+            if let data = try? JSONSerialization.data(withJSONObject: jsonObj, options: [.sortedKeys]),
+                let jsonStr = String(data: data, encoding: .utf8) {
+                line = jsonStr
+            } else {
+                // Serializing String/Double values should not fail; drop the
+                // line rather than crash if it does.
+                return
+            }
+        }
+
+        print(line)
+        fflush(stdout)
+
+        if let file = outputFile {
+            file.write(Data((line + "\n").utf8))
+        }
+    }
+
+    /// Install a SIGINT handler so Ctrl+C terminates the stream.
+    /// Only async-signal-safe calls are made inside the handler: raw
+    /// write(2) and _exit(2). (FileHandle and exit(), used previously, are
+    /// not async-signal-safe; stdout is already flushed after every line,
+    /// so skipping atexit flushing loses nothing.)
+    private func setupSignalHandler() {
+        signal(SIGINT) { _ in
+            _ = write(STDERR_FILENO, "\n", 1)
+            Darwin._exit(0)
+        }
+    }
+
+    /// Write a status message to stderr (stdout is reserved for transcript output).
+    private func log(_ message: String) {
+        FileHandle.standardError.write(Data("[scribe] \(message)\n".utf8))
+    }
+}
+
+/// Render elapsed seconds as MM:SS, or HH:MM:SS once an hour has passed.
+/// Fractional seconds are truncated.
+private func formatStreamTimestamp(_ seconds: Double) -> String {
+    let totalSeconds = Int(seconds)
+    let hours = totalSeconds / 3600
+    let minutes = (totalSeconds % 3600) / 60
+    let secs = totalSeconds % 60
+    if hours > 0 {
+        return String(format: "%02d:%02d:%02d", hours, minutes, secs)
+    }
+    return String(format: "%02d:%02d", minutes, secs)
+}
+
+/// Output format for `scribe stream`.
+enum StreamOutputFormat: String, ExpressibleByArgument, CaseIterable, Sendable {
+    case text, jsonl
+}
+
+/// Streaming engine selection for `scribe stream`.
+enum StreamEngine: String, ExpressibleByArgument, CaseIterable, Sendable {
+    case `default`
+    case nemotron
+}
diff --git a/Sources/scribe/ScribeCLI.swift b/Sources/scribe/ScribeCLI.swift
index 8d2e877..8a58abb 100644
--- a/Sources/scribe/ScribeCLI.swift
+++ b/Sources/scribe/ScribeCLI.swift
@@ -6,7 +6,7 @@ struct ScribeCLI: AsyncParsableCommand {
         commandName: "scribe",
         abstract: "State-of-the-art local audio transcription with speaker diarization for macOS.",
         version: "0.2.1",
-        subcommands: [Transcribe.self, Models.self],
+        subcommands: [Transcribe.self, Stream.self, Models.self],
         defaultSubcommand: Transcribe.self
     )
 }