diff --git a/README.md b/README.md index 0298818cc24..91d464f66b6 100644 --- a/README.md +++ b/README.md @@ -731,7 +731,37 @@ let package = Package( ) ] ) -``` + +Follow these steps to try the spotlight‑style transcription overlay using the +optimized **Turbo** model on macOS: + +1. Clone the repository and build the XCFramework: + + ```bash + git clone https://github.com/ggml-org/whisper.cpp.git + cd whisper.cpp + ./build-xcframework.sh + ``` + + The script places `whisper.xcframework` inside the `build` folder. + +2. Open the `examples/WhisperSpotlight` package in Xcode: + + ```bash + open examples/WhisperSpotlight/Package.swift + ``` + +3. In Xcode, add the generated `whisper.xcframework` to the + **WhisperSpotlight** target (File → Add Files to “WhisperSpotlight” …). + +4. Build and run the app. On first launch it will download the + `ggml-large-v3-turbo.bin` model (~1.6 GB) into + `~/Library/Application Support/WhisperSpotlight/`. + +5. Press **Option‑Space** to toggle the overlay. Speak while the window shows + “Listening”. Once transcription finishes, the recognized text is copied to the + clipboard and briefly shown on screen. + ## Voice Activity Detection (VAD) Support for Voice Activity Detection (VAD) can be enabled using the `--vad` @@ -831,6 +861,7 @@ Some of the examples are even ported to run in the browser using WebAssembly. 
Ch | [whisper-talk-llama](examples/talk-llama) | | Talk with a LLaMA bot | | [whisper.objc](examples/whisper.objc) | | iOS mobile application using whisper.cpp | | [whisper.swiftui](examples/whisper.swiftui) | | SwiftUI iOS / macOS application using whisper.cpp | +| [WhisperSpotlight](examples/WhisperSpotlight) | | macOS overlay with global hotkey for speech transcription | | [whisper.android](examples/whisper.android) | | Android mobile application using whisper.cpp | | [whisper.nvim](examples/whisper.nvim) | | Speech-to-text plugin for Neovim | | [generate-karaoke.sh](examples/generate-karaoke.sh) | | Helper script to easily [generate a karaoke video](https://youtu.be/uj7hVta4blM) of raw audio capture | diff --git a/examples/WhisperSpotlight/GlobalHotkey.swift b/examples/WhisperSpotlight/GlobalHotkey.swift new file mode 100644 index 00000000000..0886a76e79d --- /dev/null +++ b/examples/WhisperSpotlight/GlobalHotkey.swift @@ -0,0 +1,25 @@ +import Carbon +import Foundation + +class GlobalHotkey { + var handler: (() -> Void)? + private var ref: EventHotKeyRef? 
+ + init(keyCode: UInt32, modifiers: UInt32) { + var hotKeyID = EventHotKeyID(signature: OSType(0x1234), id: UInt32(keyCode)) + RegisterEventHotKey(keyCode, modifiers, hotKeyID, GetApplicationEventTarget(), 0, &ref) + let eventSpec = EventTypeSpec(eventClass: OSType(kEventClassKeyboard), eventKind: UInt32(kEventHotKeyPressed)) + InstallEventHandler(GetApplicationEventTarget(), { _, evt, ctx in + let hotKeyIDPtr = UnsafeMutablePointer<EventHotKeyID>.allocate(capacity: 1) + GetEventParameter(evt!, EventParamName(kEventParamDirectObject), EventParamType(typeEventHotKeyID), nil, MemoryLayout<EventHotKeyID>.size, nil, hotKeyIDPtr) + Unmanaged<GlobalHotkey>.fromOpaque(ctx!).takeUnretainedValue().handler?() + return noErr + }, 1, [eventSpec], Unmanaged.passUnretained(self).toOpaque(), nil) + } + + deinit { + if let ref { UnregisterEventHotKey(ref) } + } +} + +let optionKey: UInt32 = 0x0800 // Carbon Option-modifier mask (Carbon.optionKey) diff --git a/examples/WhisperSpotlight/ModelManager.swift b/examples/WhisperSpotlight/ModelManager.swift new file mode 100644 index 00000000000..0b62df3cb44 --- /dev/null +++ b/examples/WhisperSpotlight/ModelManager.swift @@ -0,0 +1,52 @@ +import Foundation + +struct ModelManager { + private let fileManager = FileManager.default + private let modelFile = "ggml-large-v3-turbo.bin" + private let url = URL(string: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo.bin")! + + func modelPath() -> URL { + let app = fileManager.urls(for: .applicationSupportDirectory, in: .userDomainMask)[0] + .appending(path: "WhisperSpotlight") + try? fileManager.createDirectory(at: app, withIntermediateDirectories: true) + return app.appending(path: modelFile) + } + + func ensureModel(progress: ((Double) -> Void)? = nil) async throws { + let path = modelPath() + if fileManager.fileExists(atPath: path.path) { return } + try await downloadModel(to: path, progress: progress) + } + + private func downloadModel(to path: URL, progress: ((Double) -> Void)?)
async throws { + let request = URLRequest(url: url) + for _ in 0..<3 { + do { + let (temp, response) = try await URLSession.shared.download(for: request, delegate: ProgressDelegate(progress)) + guard let http = response as? HTTPURLResponse, (200...299).contains(http.statusCode) else { throw URLError(.badServerResponse) } + try fileManager.moveItem(at: temp, to: path) + let attr = try fileManager.attributesOfItem(atPath: path.path) + if let size = attr[.size] as? NSNumber, size.intValue > 1_400_000_000 { return } + } catch { + try? fileManager.removeItem(at: path) + continue + } + } + throw URLError(.cannotCreateFile) + } +} + +private class ProgressDelegate: NSObject, URLSessionTaskDelegate { + let callback: ((Double) -> Void)? + init(_ cb: ((Double) -> Void)?) { self.callback = cb } + func urlSession(_ session: URLSession, task: URLSessionTask, didSendBodyData bytesSent: Int64, totalBytesSent: Int64, totalBytesExpectedToSend: Int64) { + if totalBytesExpectedToSend > 0 { + callback?(Double(totalBytesSent)/Double(totalBytesExpectedToSend)) + } + } + func urlSession(_ session: URLSession, downloadTask: URLSessionDownloadTask, didWriteData bytesWritten: Int64, totalBytesWritten: Int64, totalBytesExpectedToWrite: Int64) { + if totalBytesExpectedToWrite > 0 { + callback?(Double(totalBytesWritten)/Double(totalBytesExpectedToWrite)) + } + } +} diff --git a/examples/WhisperSpotlight/OverlayView.swift b/examples/WhisperSpotlight/OverlayView.swift new file mode 100644 index 00000000000..34beac17782 --- /dev/null +++ b/examples/WhisperSpotlight/OverlayView.swift @@ -0,0 +1,81 @@ +import SwiftUI +import AppKit + +enum OverlayState { + case idle, listening, transcribing(String), done(String) +} + +struct OverlayView: View { + @State private var state: OverlayState = .idle + @State private var recorder = Recorder() + @State private var modelURL: URL? = nil + @State private var manager = ModelManager() + @State private var task: Task? 
= nil + + var body: some View { + VStack { + switch state { + case .idle: + Image(systemName: "mic") + .onTapGesture { toggleListening() } + case .listening: + ProgressView("Listening…") + .onAppear { startRecording() } + case .transcribing(let text): + ProgressView(text) + case .done(let text): + Text(text) + } + } + .frame(width: 320, height: 200) + .background(Material.thick) + .cornerRadius(12) + .onReceive(NotificationCenter.default.publisher(for: .toggleOverlay)) { _ in + toggleListening() + } + } + + private func toggleListening() { + switch state { + case .idle: state = .listening + case .listening: stopRecording() + default: break + } + } + + private func startRecording() { + task = Task { + do { + try await manager.ensureModel() + let file = try FileManager.default + .temporaryDirectory.appending(path: "record.wav") + try await recorder.startRecording(toOutputFile: file, delegate: nil) + } catch {} + } + } + + private func stopRecording() { + task?.cancel() + Task { + recorder.stopRecording() + if let url = recorder.currentFile { + state = .transcribing("Transcribing…") + let ctx = try? WhisperContext.createContext(path: manager.modelPath().path()) + if let data = try? decodeWaveFile(url) { + ctx?.fullTranscribe(samples: data, language: "") + let text = ctx?.getTranscription() ?? 
"" + NSPasteboard.general.clearContents() + NSPasteboard.general.setString(text, forType: .string) + state = .done(text) + } + } + DispatchQueue.main.asyncAfter(deadline: .now() + 2) { + state = .idle + } + } + } +} + +extension Notification.Name { + static let toggleOverlay = Notification.Name("ToggleOverlay") +} diff --git a/examples/WhisperSpotlight/Package.swift b/examples/WhisperSpotlight/Package.swift new file mode 100644 index 00000000000..c82bfce2110 --- /dev/null +++ b/examples/WhisperSpotlight/Package.swift @@ -0,0 +1,15 @@ +// swift-tools-version:5.7 +import PackageDescription + +let package = Package( + name: "WhisperSpotlight", + platforms: [.macOS(.v13)], + products: [ + .library(name: "WhisperSpotlight", targets: ["WhisperSpotlight"]) + ], + dependencies: [], + targets: [ + .target(name: "WhisperSpotlight", dependencies: [], path: "", exclude: ["Tests"]), + .testTarget(name: "WhisperSpotlightTests", dependencies: ["WhisperSpotlight"], path: "Tests") + ] +) diff --git a/examples/WhisperSpotlight/Recorder.swift b/examples/WhisperSpotlight/Recorder.swift new file mode 100644 index 00000000000..41ea82243b0 --- /dev/null +++ b/examples/WhisperSpotlight/Recorder.swift @@ -0,0 +1,26 @@ +import Foundation +import AVFoundation + +actor Recorder { + private var recorder: AVAudioRecorder? + private(set) var currentFile: URL? + + func startRecording(toOutputFile url: URL, delegate: AVAudioRecorderDelegate?) 
throws { + currentFile = url + let settings: [String: Any] = [ + AVFormatIDKey: Int(kAudioFormatLinearPCM), + AVSampleRateKey: 16000.0, + AVNumberOfChannelsKey: 1, + AVEncoderAudioQualityKey: AVAudioQuality.high.rawValue + ] + let rec = try AVAudioRecorder(url: url, settings: settings) + rec.delegate = delegate + guard rec.record() else { throw NSError(domain: "rec", code: 1) } + recorder = rec + } + + func stopRecording() { + recorder?.stop() + recorder = nil + } +} diff --git a/examples/WhisperSpotlight/RiffWaveUtils.swift b/examples/WhisperSpotlight/RiffWaveUtils.swift new file mode 100644 index 00000000000..ea8670843f4 --- /dev/null +++ b/examples/WhisperSpotlight/RiffWaveUtils.swift @@ -0,0 +1,12 @@ +import Foundation + +func decodeWaveFile(_ url: URL) throws -> [Float] { + let data = try Data(contentsOf: url) + let floats = stride(from: 44, to: data.count, by: 2).map { + return data[$0..<$0 + 2].withUnsafeBytes { + let short = Int16(littleEndian: $0.load(as: Int16.self)) + return max(-1.0, min(Float(short) / 32767.0, 1.0)) + } + } + return floats +} diff --git a/examples/WhisperSpotlight/Tests/IntegrationTests.swift b/examples/WhisperSpotlight/Tests/IntegrationTests.swift new file mode 100644 index 00000000000..b690a4a6b32 --- /dev/null +++ b/examples/WhisperSpotlight/Tests/IntegrationTests.swift @@ -0,0 +1,10 @@ +import XCTest +@testable import WhisperSpotlight + +final class IntegrationTests: XCTestCase { + func testClipboardWrite() throws { + NSPasteboard.general.clearContents() + NSPasteboard.general.setString("hello", forType: .string) + XCTAssertEqual(NSPasteboard.general.string(forType: .string), "hello") + } +} diff --git a/examples/WhisperSpotlight/Tests/ModelManagerTests.swift b/examples/WhisperSpotlight/Tests/ModelManagerTests.swift new file mode 100644 index 00000000000..522af7db578 --- /dev/null +++ b/examples/WhisperSpotlight/Tests/ModelManagerTests.swift @@ -0,0 +1,10 @@ +import XCTest +@testable import WhisperSpotlight + +final class 
ModelManagerTests: XCTestCase { + func testModelPathCreation() throws { + let manager = ModelManager() + let path = manager.modelPath() + XCTAssertTrue(path.path.contains("WhisperSpotlight")) + } +} diff --git a/examples/WhisperSpotlight/Tests/RecorderTests.swift b/examples/WhisperSpotlight/Tests/RecorderTests.swift new file mode 100644 index 00000000000..5168bbcb80b --- /dev/null +++ b/examples/WhisperSpotlight/Tests/RecorderTests.swift @@ -0,0 +1,13 @@ +import XCTest +@testable import WhisperSpotlight + +final class RecorderTests: XCTestCase { + func testWavHeader() async throws { + let recorder = Recorder() + let url = FileManager.default.temporaryDirectory.appending(path: "test.wav") + try await recorder.startRecording(toOutputFile: url, delegate: nil) + recorder.stopRecording() + let data = try Data(contentsOf: url) + XCTAssertEqual(String(data: data.prefix(4), encoding: .ascii), "RIFF") + } +} diff --git a/examples/WhisperSpotlight/WhisperSpotlightApp.swift b/examples/WhisperSpotlight/WhisperSpotlightApp.swift new file mode 100644 index 00000000000..baff62c53f9 --- /dev/null +++ b/examples/WhisperSpotlight/WhisperSpotlightApp.swift @@ -0,0 +1,21 @@ +import SwiftUI + +@main +struct WhisperSpotlightApp: App { + @NSApplicationDelegateAdaptor(AppDelegate.self) var delegate + var body: some Scene { + WindowGroup { + OverlayView() + } + } +} + +class AppDelegate: NSObject, NSApplicationDelegate { + private var hotkey: GlobalHotkey? 
+ func applicationDidFinishLaunching(_ notification: Notification) { + hotkey = GlobalHotkey(keyCode: kVK_Space, modifiers: optionKey) + hotkey?.handler = { + NotificationCenter.default.post(name: .toggleOverlay, object: nil) + } + } +} diff --git a/examples/WhisperSpotlight/whispercpp/LibWhisper.swift b/examples/WhisperSpotlight/whispercpp/LibWhisper.swift new file mode 100644 index 00000000000..3b3f0fdd04d --- /dev/null +++ b/examples/WhisperSpotlight/whispercpp/LibWhisper.swift @@ -0,0 +1,157 @@ +import Foundation +import AppKit +import whisper + +enum WhisperError: Error { + case couldNotInitializeContext +} + +// Meet Whisper C++ constraint: Don't access from more than one thread at a time. +actor WhisperContext { + private var context: OpaquePointer + + init(context: OpaquePointer) { + self.context = context + } + + deinit { + whisper_free(context) + } + + func fullTranscribe(samples: [Float], language: String = "") { + let maxThreads = max(1, min(8, cpuCount() - 2)) + print("Selecting \(maxThreads) threads") + var params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY) + params.print_realtime = true + params.print_progress = false + params.print_timestamps = true + params.print_special = false + params.translate = false + params.n_threads = Int32(maxThreads) + params.offset_ms = 0 + params.no_context = true + params.single_segment = false + + samples.withUnsafeBufferPointer { samples in + language.withCString { lang in + params.language = lang + params.detect_language = language.isEmpty + whisper_reset_timings(context) + print("About to run whisper_full") + if (whisper_full(context, params, samples.baseAddress, Int32(samples.count)) != 0) { + print("Failed to run the model") + } else { + whisper_print_timings(context) + } + } + } + } + + func getTranscription() -> String { + var transcription = "" + for i in 0.. 
String { + return String.init(cString: whisper_bench_memcpy_str(nThreads)) + } + + static func benchGgmlMulMat(nThreads: Int32) async -> String { + return String.init(cString: whisper_bench_ggml_mul_mat_str(nThreads)) + } + + private func systemInfo() -> String { + var info = "" + //if (ggml_cpu_has_neon() != 0) { info += "NEON " } + return String(info.dropLast()) + } + + func benchFull(modelName: String, nThreads: Int32) async -> String { + let nMels = whisper_model_n_mels(context) + if (whisper_set_mel(context, nil, 0, nMels) != 0) { + return "error: failed to set mel" + } + + // heat encoder + if (whisper_encode(context, 0, nThreads) != 0) { + return "error: failed to encode" + } + + var tokens = [whisper_token](repeating: 0, count: 512) + + // prompt heat + if (whisper_decode(context, &tokens, 256, 0, nThreads) != 0) { + return "error: failed to decode" + } + + // text-generation heat + if (whisper_decode(context, &tokens, 1, 256, nThreads) != 0) { + return "error: failed to decode" + } + + whisper_reset_timings(context) + + // actual run + if (whisper_encode(context, 0, nThreads) != 0) { + return "error: failed to encode" + } + + // text-generation + for i in 0..<256 { + if (whisper_decode(context, &tokens, 1, Int32(i), nThreads) != 0) { + return "error: failed to decode" + } + } + + // batched decoding + for _ in 0..<64 { + if (whisper_decode(context, &tokens, 5, 0, nThreads) != 0) { + return "error: failed to decode" + } + } + + // prompt processing + for _ in 0..<16 { + if (whisper_decode(context, &tokens, 256, 0, nThreads) != 0) { + return "error: failed to decode" + } + } + + whisper_print_timings(context) + + let deviceModel = Host.current().localizedName ?? 
"Mac" + let systemName = ProcessInfo.processInfo.operatingSystemVersionString + let systemInfo = self.systemInfo() + let timings: whisper_timings = whisper_get_timings(context).pointee + let encodeMs = String(format: "%.2f", timings.encode_ms) + let decodeMs = String(format: "%.2f", timings.decode_ms) + let batchdMs = String(format: "%.2f", timings.batchd_ms) + let promptMs = String(format: "%.2f", timings.prompt_ms) + return "| \(deviceModel) | \(systemName) | \(systemInfo) | \(modelName) | \(nThreads) | 1 | \(encodeMs) | \(decodeMs) | \(batchdMs) | \(promptMs) | |" + } + + static func createContext(path: String) throws -> WhisperContext { + var params = whisper_context_default_params() +#if targetEnvironment(simulator) + params.use_gpu = false + print("Running on the simulator, using CPU") +#else + params.use_gpu = true +#endif + params.flash_attn = true + let context = whisper_init_from_file_with_params(path, params) + if let context { + return WhisperContext(context: context) + } else { + print("Couldn't load model at \(path)") + throw WhisperError.couldNotInitializeContext + } + } +} + +fileprivate func cpuCount() -> Int { + ProcessInfo.processInfo.processorCount +} diff --git a/examples/whisper.swiftui/whisper.cpp.swift/LibWhisper.swift b/examples/whisper.swiftui/whisper.cpp.swift/LibWhisper.swift index 3c769e7af8e..8195d047a4a 100644 --- a/examples/whisper.swiftui/whisper.cpp.swift/LibWhisper.swift +++ b/examples/whisper.swiftui/whisper.cpp.swift/LibWhisper.swift @@ -18,27 +18,26 @@ actor WhisperContext { whisper_free(context) } - func fullTranscribe(samples: [Float]) { - // Leave 2 processors free (i.e. the high-efficiency cores). 
+ func fullTranscribe(samples: [Float], language: String = "") { let maxThreads = max(1, min(8, cpuCount() - 2)) print("Selecting \(maxThreads) threads") var params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY) - "en".withCString { en in - // Adapted from whisper.objc - params.print_realtime = true - params.print_progress = false - params.print_timestamps = true - params.print_special = false - params.translate = false - params.language = en - params.n_threads = Int32(maxThreads) - params.offset_ms = 0 - params.no_context = true - params.single_segment = false - - whisper_reset_timings(context) - print("About to run whisper_full") - samples.withUnsafeBufferPointer { samples in + params.print_realtime = true + params.print_progress = false + params.print_timestamps = true + params.print_special = false + params.translate = false + params.n_threads = Int32(maxThreads) + params.offset_ms = 0 + params.no_context = true + params.single_segment = false + + samples.withUnsafeBufferPointer { samples in + language.withCString { lang in + params.language = lang + params.detect_language = language.isEmpty + whisper_reset_timings(context) + print("About to run whisper_full") if (whisper_full(context, params, samples.baseAddress, Int32(samples.count)) != 0) { print("Failed to run the model") } else { @@ -140,8 +139,9 @@ actor WhisperContext { params.use_gpu = false print("Running on the simulator, using CPU") #else - params.flash_attn = true // Enabled by default for Metal + params.use_gpu = true #endif + params.flash_attn = true let context = whisper_init_from_file_with_params(path, params) if let context { return WhisperContext(context: context)