Skip to content
19 changes: 17 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,19 @@ scribe transcribe meeting.wav --language es # Spanish
scribe transcribe meeting.wav --language fr # French
```

### Live streaming from microphone

```bash
scribe stream # multilingual (25 languages, ~11s latency)
scribe stream --engine nemotron # English-only, low latency (~560ms), with punctuation
scribe stream --format jsonl # JSONL output (one JSON object per line)
scribe stream --output meeting.txt # save to file while streaming
```

> **Note on streaming engines:**
> - **Default** (Parakeet TDT v3): Supports all 25 languages including Spanish. Higher latency (~11s for confirmed text) because it uses a batch model in sliding windows. Live preview appears on screen while speaking.
> - **Nemotron** (`--engine nemotron`): English-only but much faster (~560ms latency). Includes punctuation and capitalization. Recommended for English-only meetings.

### Pre-download models

```bash
Expand Down Expand Up @@ -122,8 +135,10 @@ Tested on Apple Silicon (M-series):
|------|-------|---------|
| Transcription only | ~130x real-time | 4-min file in 1.7s |
| Transcription + diarization | ~30x real-time | 4-min file in 7.5s |
| Live streaming (Nemotron) | ~560ms latency | English, with punctuation |
| Live streaming (default) | ~11s latency | 25 languages |

Models are downloaded automatically on first use (~600MB for ASR, ~50MB for diarization).
Models are downloaded automatically on first use (~600MB per model). First run may take a minute.

## Requirements

Expand All @@ -138,7 +153,7 @@ Bulgarian, Croatian, Czech, Danish, Dutch, English, Estonian, Finnish, French, G

scribe is built on the shoulders of excellent open-source projects:

- **[NVIDIA Parakeet](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3)** (CC-BY-4.0) — The speech recognition model that powers transcription
- **[NVIDIA Parakeet](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3)** (CC-BY-4.0) — Speech recognition models (TDT v3 for batch, Nemotron for streaming)
- **[FluidAudio](https://github.com/FluidInference/FluidAudio)** (Apache 2.0) by FluidInference — CoreML speech processing SDK for Apple Silicon
- **[pyannote.audio](https://github.com/pyannote/pyannote-audio)** (MIT) by Hervé Bredin — The diarization model architecture
- **[swift-argument-parser](https://github.com/apple/swift-argument-parser)** (Apache 2.0) by Apple — CLI argument parsing
Expand Down
317 changes: 317 additions & 0 deletions Sources/scribe/Commands/Stream.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,317 @@
import ArgumentParser
import AVFoundation
import FluidAudio
import Foundation

/// Thread-safe state for tracking incremental transcript output.
/// Thread-safe state for tracking incremental transcript output.
///
/// The streaming engine delivers the *full accumulated* transcript on every
/// callback; this actor diffs successive snapshots so the command can emit
/// only the newly-added tail to stdout.
private actor StreamState {
    // Last full (trimmed) transcript snapshot we processed; used to
    // short-circuit duplicate callbacks.
    private var lastFullTranscript = ""
    // Number of Characters of the transcript already emitted to stdout.
    private var printedLength = 0

    /// Given the full accumulated transcript, return only the new portion.
    ///
    /// Diffing is by printed length only: if the transcript *grew*, everything
    /// past `printedLength` is returned. Revisions the model makes to text
    /// that was already printed are NOT reconciled — a shortened transcript
    /// yields `nil`, and a revision that also lengthens the transcript emits
    /// the raw character tail, which may not align with a word boundary.
    /// NOTE(review): despite appearances there is no longest-common-prefix
    /// computation here; already-printed text is final from the caller's view.
    func getNewText(_ fullTranscript: String) -> String? {
        let text = fullTranscript.trimmingCharacters(in: .whitespaces)
        guard !text.isEmpty else { return nil }
        guard text != lastFullTranscript else { return nil }

        lastFullTranscript = text

        // Find how much of the text we've already printed
        if text.count > printedLength {
            let newPortion = String(text.dropFirst(printedLength)).trimmingCharacters(in: .whitespaces)
            if !newPortion.isEmpty {
                printedLength = text.count
                return newPortion
            }
        }

        return nil
    }

    /// Get the live preview (last `maxLen` Characters of the full transcript).
    ///
    /// Returns `nil` for empty input or when the transcript is unchanged
    /// since the last `getNewText` call. Unlike `getNewText`, this does not
    /// mutate state. NOTE(review): currently unused by the command — the
    /// Nemotron callback inlines an equivalent preview; kept for API parity.
    func getPreview(_ fullTranscript: String, maxLen: Int = 100) -> String? {
        let text = fullTranscript.trimmingCharacters(in: .whitespaces)
        guard !text.isEmpty else { return nil }
        guard text != lastFullTranscript else { return nil }
        return String(text.suffix(maxLen))
    }
}

/// `scribe stream` — live microphone transcription.
///
/// Two engines are supported (see `StreamEngine`): the FluidAudio Nemotron
/// low-latency English-only streamer, and a multilingual sliding-window
/// manager built on the batch Parakeet model. Confirmed text goes to stdout
/// (and optionally a file); ephemeral previews and status go to stderr.
struct Stream: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
        abstract: "Stream live audio transcription from microphone."
    )

    // Output format for confirmed lines written to stdout.
    @Option(name: .long, help: "Output format: text or jsonl.")
    var format: StreamOutputFormat = .text

    // Which streaming backend to drive; see run() for dispatch.
    @Option(name: .long, help: "Streaming engine: default (multilingual, higher latency) or nemotron (English-only, low latency ~560ms).")
    var engine: StreamEngine = .default

    // Optional path: confirmed lines are mirrored here as they are emitted.
    @Option(name: .shortAndLong, help: "Also save output to file.")
    var output: String?

    // When set, logs the microphone's sample rate / channel count to stderr.
    @Flag(name: .long, help: "Show status information.")
    var verbose: Bool = false

    /// Entry point: dispatch to the implementation for the selected engine.
    func run() async throws {
        switch engine {
        case .nemotron:
            try await runNemotron()
        case .default:
            try await runSlidingWindow()
        }
    }

    // MARK: - Nemotron Engine (English-only, low latency)

    /// Load the Nemotron streaming model (retrying once with a cleaned cache
    /// on failure), then hand off to `runNemotronWithEngine`.
    ///
    /// - Throws: rethrows model-load errors from the *retry* attempt, and any
    ///   error from the streaming session itself.
    private func runNemotron() async throws {
        log("Initializing streaming ASR (Nemotron 560ms, English-only)...")
        log("Downloading model if needed (first run only, ~600MB)...")

        let engine = StreamingAsrEngineFactory.create(.nemotron560ms)

        do {
            try await engine.loadModels()
        } catch {
            // A partially-downloaded/corrupt cache is the presumed cause of a
            // load failure; wipe the engine's cache directory and retry once.
            // NOTE(review): the cache path is hard-coded to mirror FluidAudio's
            // layout — confirm it stays in sync with the library.
            log("Model loading failed: \(error.localizedDescription)")
            log("Cleaning cache and retrying download...")

            let cacheDir = FileManager.default.urls(for: .applicationSupportDirectory, in: .userDomainMask).first!
                .appendingPathComponent("FluidAudio/Models/nemotron-streaming")
            try? FileManager.default.removeItem(at: cacheDir)

            let freshEngine = StreamingAsrEngineFactory.create(.nemotron560ms)
            try await freshEngine.loadModels()
            log("Retry successful.")
            try await runNemotronWithEngine(freshEngine)
            return
        }

        log("Models loaded.")
        try await runNemotronWithEngine(engine)
    }

    /// Drive a loaded Nemotron engine: tap the default microphone, forward
    /// audio buffers to the engine, and emit transcript deltas as they arrive.
    ///
    /// Runs until the process is killed — the `while true` pump below has no
    /// other exit, so cleanup relies on the SIGINT handler's `exit(0)`.
    private func runNemotronWithEngine(_ engine: any StreamingAsrEngine) async throws {
        let audioEngine = AVAudioEngine()
        let inputNode = audioEngine.inputNode
        let inputFormat = inputNode.outputFormat(forBus: 0)

        if verbose {
            log("Microphone: \(inputFormat.sampleRate)Hz, \(inputFormat.channelCount) ch")
        }

        let startTime = Date()
        let state = StreamState()
        let fmt = format  // captured by the Sendable callback below
        var outputFile: FileHandle? = nil

        if let outputPath = output {
            // Truncates any existing file at this path.
            FileManager.default.createFile(atPath: outputPath, contents: nil)
            outputFile = FileHandle(forWritingAtPath: outputPath)
        }

        // Capture file handle for sendable closure
        nonisolated(unsafe) let outFile = outputFile

        // Partial callback — fires after each 560ms chunk with the full accumulated transcript.
        // We diff to find new text and emit only the delta.
        // NOTE(review): each callback spawns an unstructured Task; two callbacks
        // in quick succession could interleave their prints out of order.
        await engine.setPartialTranscriptCallback { fullTranscript in
            Task {
                let elapsed = Date().timeIntervalSince(startTime)

                switch fmt {
                case .text:
                    // Show live preview on stderr (ephemeral, overwritten)
                    let preview = String(fullTranscript.trimmingCharacters(in: .whitespaces).suffix(100))
                    if !preview.isEmpty {
                        let ts = formatStreamTimestamp(elapsed)
                        // "\r\u{1B}[K" = carriage return + ANSI erase-to-end-of-line.
                        FileHandle.standardError.write(Data("\r\u{1B}[K[\(ts)] \(preview)".utf8))
                    }
                case .jsonl:
                    break
                }

                // Emit new portion to stdout
                if let newText = await state.getNewText(fullTranscript) {
                    let ts = formatStreamTimestamp(elapsed)

                    let line: String
                    switch fmt {
                    case .text:
                        // Clear the ephemeral preview before the confirmed line.
                        FileHandle.standardError.write(Data("\r\u{1B}[K".utf8))
                        line = "[\(ts)] \(newText)"
                    case .jsonl:
                        let jsonObj: [String: Any] = [
                            "time": round(elapsed * 10) / 10,  // one decimal place
                            "text": newText,
                        ]
                        if let data = try? JSONSerialization.data(withJSONObject: jsonObj, options: [.sortedKeys]),
                            let jsonStr = String(data: data, encoding: .utf8) {
                            line = jsonStr
                        } else {
                            // Serialization of [String: Any] with these value
                            // types should not fail; drop the line if it does.
                            return
                        }
                    }

                    print(line)
                    fflush(stdout)

                    if let file = outFile {
                        file.write(Data((line + "\n").utf8))
                    }
                }
            }
        }

        setupSignalHandler()

        inputNode.installTap(onBus: 0, bufferSize: 4096, format: inputFormat) { buffer, _ in
            nonisolated(unsafe) let buf = buffer
            // NOTE(review): append errors are deliberately swallowed here —
            // the tap callback has no good recovery path; consider logging.
            do { try engine.appendAudio(buf) } catch {}
        }

        try audioEngine.start()
        log("Listening (English, low latency)... press Ctrl+C to stop")

        // Processing loop — drives the engine to process buffered audio
        while true {
            try await engine.processBufferedAudio()
            try await Task.sleep(nanoseconds: 50_000_000) // 50ms
        }
    }

    // MARK: - SlidingWindow Engine (Multilingual, default)

    /// Drive the multilingual sliding-window engine: tap the microphone,
    /// stream buffers to the manager, and consume its async update stream.
    ///
    /// Confirmed updates are emitted as lines; volatile (in-progress) updates
    /// are shown only as an ephemeral stderr preview in text mode.
    private func runSlidingWindow() async throws {
        log("Initializing streaming ASR (Parakeet TDT v3, multilingual)...")
        log("Downloading model if needed (first run only, ~600MB)...")

        let streamManager = SlidingWindowAsrManager(config: .streaming)
        try await streamManager.start(source: .microphone)

        log("Models loaded.")

        let audioEngine = AVAudioEngine()
        let inputNode = audioEngine.inputNode
        let inputFormat = inputNode.outputFormat(forBus: 0)

        if verbose {
            log("Microphone: \(inputFormat.sampleRate)Hz, \(inputFormat.channelCount) ch")
        }

        let startTime = Date()
        var outputFile: FileHandle? = nil
        // De-duplication state: the last confirmed / volatile text we handled.
        var lastConfirmedText = ""
        var lastVolatileText = ""

        if let outputPath = output {
            // Truncates any existing file at this path.
            FileManager.default.createFile(atPath: outputPath, contents: nil)
            outputFile = FileHandle(forWritingAtPath: outputPath)
        }

        setupSignalHandler()

        inputNode.installTap(onBus: 0, bufferSize: 4096, format: inputFormat) { buffer, _ in
            nonisolated(unsafe) let buf = buffer
            streamManager.streamAudio(buf)
        }

        try audioEngine.start()
        log("Listening (multilingual)... press Ctrl+C to stop")

        // Consume the manager's async sequence of transcription updates.
        // NOTE(review): this loop ends only if the stream finishes; Ctrl+C
        // normally exits via the SIGINT handler before the cleanup below runs.
        let updates = await streamManager.transcriptionUpdates
        for await update in updates {
            let text = update.text.trimmingCharacters(in: .whitespaces)
            guard !text.isEmpty else { continue }

            let elapsed = Date().timeIntervalSince(startTime)

            if update.isConfirmed {
                guard text != lastConfirmedText else { continue }
                lastConfirmedText = text
                lastVolatileText = ""
                emitLine(text: text, elapsed: elapsed, outputFile: outputFile)
            } else {
                guard text != lastVolatileText else { continue }
                lastVolatileText = text

                switch format {
                case .text:
                    // Ephemeral preview on stderr, overwritten in place.
                    let ts = formatStreamTimestamp(elapsed)
                    let preview = String(text.suffix(100))
                    FileHandle.standardError.write(Data("\r\u{1B}[K[\(ts)] \(preview)".utf8))
                case .jsonl:
                    break
                }
            }
        }

        audioEngine.stop()
        inputNode.removeTap(onBus: 0)
        _ = try await streamManager.finish()
    }

    // MARK: - Helpers

    /// Emit one confirmed line to stdout (and the optional output file),
    /// formatted per `format`. In text mode the stderr preview line is
    /// cleared first so the confirmed line doesn't collide with it.
    private func emitLine(text: String, elapsed: Double, outputFile: FileHandle?) {
        let line: String

        switch format {
        case .text:
            let ts = formatStreamTimestamp(elapsed)
            FileHandle.standardError.write(Data("\r\u{1B}[K".utf8))
            line = "[\(ts)] \(text)"
        case .jsonl:
            let jsonObj: [String: Any] = [
                "time": round(elapsed * 10) / 10,  // one decimal place
                "text": text,
            ]
            if let data = try? JSONSerialization.data(withJSONObject: jsonObj, options: [.sortedKeys]),
                let jsonStr = String(data: data, encoding: .utf8) {
                line = jsonStr
            } else {
                // Should be unreachable for these value types; drop the line.
                return
            }
        }

        print(line)
        fflush(stdout)

        if let file = outputFile {
            file.write(Data((line + "\n").utf8))
        }
    }

    /// Install a SIGINT handler that prints a newline (to move past any
    /// in-place preview) and exits immediately. This is the intended exit
    /// path for both engines' run loops.
    private func setupSignalHandler() {
        signal(SIGINT) { _ in
            FileHandle.standardError.write(Data("\n".utf8))
            Darwin.exit(0)
        }
    }

    /// Write a status message to stderr, keeping stdout clean for transcript.
    private func log(_ message: String) {
        FileHandle.standardError.write(Data("[scribe] \(message)\n".utf8))
    }
}

/// Render an elapsed-seconds value as "MM:SS", switching to "HH:MM:SS"
/// once a full hour has elapsed. Fractional seconds are truncated.
private func formatStreamTimestamp(_ seconds: Double) -> String {
    let whole = Int(seconds)
    let (hours, remainder) = whole.quotientAndRemainder(dividingBy: 3600)
    let (minutes, secs) = remainder.quotientAndRemainder(dividingBy: 60)
    return hours > 0
        ? String(format: "%02d:%02d:%02d", hours, minutes, secs)
        : String(format: "%02d:%02d", minutes, secs)
}

/// Output mode for the `stream` subcommand's stdout lines.
enum StreamOutputFormat: String, ExpressibleByArgument, CaseIterable, Sendable {
    /// Human-readable "[MM:SS] text" lines.
    case text
    /// One JSON object per confirmed segment.
    case jsonl
}

/// Selectable streaming backend for the `stream` subcommand.
enum StreamEngine: String, ExpressibleByArgument, CaseIterable, Sendable {
    /// Multilingual sliding-window engine (higher latency).
    case `default`
    /// English-only low-latency Nemotron engine.
    case nemotron
}
2 changes: 1 addition & 1 deletion Sources/scribe/ScribeCLI.swift
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ struct ScribeCLI: AsyncParsableCommand {
commandName: "scribe",
abstract: "State-of-the-art local audio transcription with speaker diarization for macOS.",
version: "0.2.1",
subcommands: [Transcribe.self, Models.self],
subcommands: [Transcribe.self, Stream.self, Models.self],
defaultSubcommand: Transcribe.self
)
}
Loading