From aa562fc55cc5eb85a5973d79dc4959eee78d1994 Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Sun, 28 Dec 2025 02:41:30 +0100 Subject: [PATCH 1/4] Add model loading benchmarks --- Package.swift | 12 ++ Tests/Benchmarks/ModelLoadingBenchmarks.swift | 112 ++++++++++++++++++ 2 files changed, 124 insertions(+) create mode 100644 Tests/Benchmarks/ModelLoadingBenchmarks.swift diff --git a/Package.swift b/Package.swift index b00ec470..9cb51a20 100644 --- a/Package.swift +++ b/Package.swift @@ -104,6 +104,18 @@ let package = Package( .enableExperimentalFeature("StrictConcurrency") ] ), + .testTarget( + name: "Benchmarks", + dependencies: [ + "MLXLLM", + "MLXVLM", + "MLXLMCommon", + ], + path: "Tests/Benchmarks", + swiftSettings: [ + .enableExperimentalFeature("StrictConcurrency") + ] + ), .target( name: "MLXEmbedders", dependencies: [ diff --git a/Tests/Benchmarks/ModelLoadingBenchmarks.swift b/Tests/Benchmarks/ModelLoadingBenchmarks.swift new file mode 100644 index 00000000..a2dfc714 --- /dev/null +++ b/Tests/Benchmarks/ModelLoadingBenchmarks.swift @@ -0,0 +1,112 @@ +import Foundation +import Hub +import MLX +import MLXLLM +import MLXLMCommon +import MLXVLM +import Testing + +private let benchmarksEnabled = ProcessInfo.processInfo.environment["RUN_BENCHMARKS"] != nil + +private struct BenchmarkStats { + let mean: Double + let median: Double + let stdDev: Double + let min: Double + let max: Double + + init(times: [Double]) { + precondition(!times.isEmpty, "BenchmarkStats requires at least one timing measurement") + let sorted = times.sorted() + self.min = sorted.first ?? 0 + self.max = sorted.last ?? 0 + let mean = times.reduce(0, +) / Double(times.count) + self.mean = mean + self.median = sorted[sorted.count / 2] + + let squaredDiffs = times.map { ($0 - mean) * ($0 - mean) } + self.stdDev = sqrt(squaredDiffs.reduce(0, +) / Double(times.count)) + } + + func printSummary(label: String) { + print("\(label) results:") + print(" Mean: \(String(format: "%.0f", mean))ms") + print(" Median: \(String(format: "%.0f", median))ms") + print(" StdDev: \(String(format: "%.1f", stdDev))ms") + print(" Range: \(String(format: "%.0f", min))-\(String(format: "%.0f", max))ms") + } +} + +@Suite(.serialized) +struct ModelLoadingBenchmarks { + + /// Benchmark LLM model loading + /// Tests: parallel tokenizer/weights, single config.json read + @Test(.enabled(if: benchmarksEnabled)) + func loadLLM() async throws { + let modelId = "mlx-community/Qwen3-0.6B-4bit" + let hub = HubApi() + let config = ModelConfiguration(id: modelId) + + // Warm-up run: ensure model is downloaded and caches are primed + _ = try await LLMModelFactory.shared.load(hub: hub, configuration: config) { _ in } + GPU.clearCache() + + // Benchmark multiple runs + let runs = 7 + var times: [Double] = [] + + for i in 1 ... runs { + let start = CFAbsoluteTimeGetCurrent() + + _ = try await LLMModelFactory.shared.load( + hub: hub, + configuration: config + ) { _ in } + + let elapsed = (CFAbsoluteTimeGetCurrent() - start) * 1000 + times.append(elapsed) + print("LLM load run \(i): \(String(format: "%.0f", elapsed))ms") + + // Clear GPU cache to ensure independent measurements + GPU.clearCache() + } + + BenchmarkStats(times: times).printSummary(label: "LLM load") + } + + /// Benchmark VLM model loading + /// Tests: parallel tokenizer/weights, single config.json read, parallel processor config + @Test(.enabled(if: benchmarksEnabled)) + func loadVLM() async throws { + let modelId = "mlx-community/Qwen2-VL-2B-Instruct-4bit" + let hub = HubApi() + let config = ModelConfiguration(id: modelId) + + // Warm-up run: ensure model is downloaded and caches are primed + _ = try await VLMModelFactory.shared.load(hub: hub, configuration: config) { _ in } + GPU.clearCache() + + // Benchmark multiple runs + let runs = 7 + var times: [Double] = [] + + for i in 1 ... runs { + let start = CFAbsoluteTimeGetCurrent() + + _ = try await VLMModelFactory.shared.load( + hub: hub, + configuration: config + ) { _ in } + + let elapsed = (CFAbsoluteTimeGetCurrent() - start) * 1000 + times.append(elapsed) + print("VLM load run \(i): \(String(format: "%.0f", elapsed))ms") + + // Clear GPU cache to ensure independent measurements + GPU.clearCache() + } + + BenchmarkStats(times: times).printSummary(label: "VLM load") + } +} From c5725ae2c8ffbe23d15207939b56365a02e99c1e Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Sun, 28 Dec 2025 00:39:22 +0100 Subject: [PATCH 2/4] Parallelize loading of weights, tokenizer, and processor config --- Libraries/Embedders/Configuration.swift | 64 ++++++--------- Libraries/Embedders/EmbeddingModel.swift | 10 ++- Libraries/Embedders/Load.swift | 14 ++-- Libraries/MLXLLM/LLMModelFactory.swift | 26 ++++--- .../Registries/ModelTypeRegistry.swift | 11 +-- .../Registries/ProcessorTypeRegistry.swift | 10 +-- Libraries/MLXVLM/VLMModelFactory.swift | 78 +++++++++---------- 7 files changed, 100 insertions(+), 113 deletions(-) diff --git a/Libraries/Embedders/Configuration.swift b/Libraries/Embedders/Configuration.swift index 90517291..228babde 100644 --- a/Libraries/Embedders/Configuration.swift +++ b/Libraries/Embedders/Configuration.swift @@ -33,60 +33,42 @@ private class ModelTypeRegistry: @unchecked Sendable { // to remain synchronous. private let lock = NSLock() - private var creators: [String: @Sendable (URL) throws -> EmbeddingModel] = [ - "bert": { - url in - let configuration = try JSONDecoder().decode( - BertConfiguration.self, from: Data(contentsOf: url)) - let model = BertModel(configuration) - return model + private var creators: [String: @Sendable (Data) throws -> EmbeddingModel] = [ + "bert": { data in + let configuration = try JSONDecoder().decode(BertConfiguration.self, from: data) + return BertModel(configuration) }, - "roberta": { - url in - let configuration = try JSONDecoder().decode( - BertConfiguration.self, from: Data(contentsOf: url)) - let model = BertModel(configuration) - return model + "roberta": { data in + let configuration = try JSONDecoder().decode(BertConfiguration.self, from: data) + return BertModel(configuration) }, - "xlm-roberta": { - url in - let configuration = try JSONDecoder().decode( - BertConfiguration.self, from: Data(contentsOf: url)) - let model = BertModel(configuration) - return model + "xlm-roberta": { data in + let configuration = try JSONDecoder().decode(BertConfiguration.self, from: data) + return BertModel(configuration) }, - "distilbert": { - url in - let configuration = try JSONDecoder().decode( - BertConfiguration.self, from: Data(contentsOf: url)) - let model = BertModel(configuration) - return model + "distilbert": { data in + let configuration = try JSONDecoder().decode(BertConfiguration.self, from: data) + return BertModel(configuration) }, - "nomic_bert": { - url in - let configuration = try JSONDecoder().decode( - NomicBertConfiguration.self, from: Data(contentsOf: url)) - let model = NomicBertModel(configuration, pooler: false) - return model + "nomic_bert": { data in + let configuration = try JSONDecoder().decode(NomicBertConfiguration.self, from: data) + return NomicBertModel(configuration, pooler: false) }, - "qwen3": { - url in - let configuration = try JSONDecoder().decode( - Qwen3Configuration.self, from: Data(contentsOf: url)) - let model = Qwen3Model(configuration) - return model + "qwen3": { data in + let configuration = try JSONDecoder().decode(Qwen3Configuration.self, from: data) + return Qwen3Model(configuration) }, ] public func registerModelType( - _ type: String, creator: @Sendable @escaping (URL) throws -> EmbeddingModel + _ type: String, creator: @Sendable @escaping (Data) throws -> EmbeddingModel ) { lock.withLock { creators[type] = creator } } - public func createModel(configuration: URL, rawValue: String) throws -> EmbeddingModel { + public func createModel(configuration: Data, rawValue: String) throws -> EmbeddingModel { let creator = lock.withLock { creators[rawValue] } @@ -108,12 +90,12 @@ public struct ModelType: RawRepresentable, Codable, Sendable { } public static func registerModelType( - _ type: String, creator: @Sendable @escaping (URL) throws -> EmbeddingModel + _ type: String, creator: @Sendable @escaping (Data) throws -> EmbeddingModel ) { modelTypeRegistry.registerModelType(type, creator: creator) } - public func createModel(configuration: URL) throws -> EmbeddingModel { + public func createModel(configuration: Data) throws -> EmbeddingModel { try modelTypeRegistry.createModel(configuration: configuration, rawValue: rawValue) } } diff --git a/Libraries/Embedders/EmbeddingModel.swift b/Libraries/Embedders/EmbeddingModel.swift index 3c4fbed7..1513387f 100644 --- a/Libraries/Embedders/EmbeddingModel.swift +++ b/Libraries/Embedders/EmbeddingModel.swift @@ -46,13 +46,17 @@ public actor ModelContainer { public init( hub: HubApi, modelDirectory: URL, configuration: ModelConfiguration ) async throws { + // Start tokenizer config loading asynchronously, then load model synchronously. + // Both operations run in parallel because async let begins execution immediately. + async let tokenizerConfigTask = loadTokenizerConfig( + configuration: configuration, hub: hub) + self.model = try loadSynchronous(modelDirectory: modelDirectory) + self.pooler = loadPooling(modelDirectory: modelDirectory) - let (tokenizerConfig, tokenizerData) = try await loadTokenizerConfig( - configuration: configuration, hub: hub) + let (tokenizerConfig, tokenizerData) = try await tokenizerConfigTask self.tokenizer = try PreTrainedTokenizer( tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData) - self.pooler = loadPooling(modelDirectory: modelDirectory) //?? Pooling(strategy: .none) } /// Perform an action on the model and/or tokenizer. Callers _must_ eval any `MLXArray` before returning as diff --git a/Libraries/Embedders/Load.swift b/Libraries/Embedders/Load.swift index 488f3a34..0ef5010d 100644 --- a/Libraries/Embedders/Load.swift +++ b/Libraries/Embedders/Load.swift @@ -49,20 +49,24 @@ public func load( ) async throws -> (EmbeddingModel, Tokenizer) { let modelDirectory = try await prepareModelDirectory( hub: hub, configuration: configuration, progressHandler: progressHandler) + + // Start tokenizer loading asynchronously, then load model synchronously. + // Both operations run in parallel because async let begins execution immediately. + async let tokenizerTask = loadTokenizer(configuration: configuration, hub: hub) let model = try loadSynchronous(modelDirectory: modelDirectory) - let tokenizer = try await loadTokenizer(configuration: configuration, hub: hub) + let tokenizer = try await tokenizerTask return (model, tokenizer) } func loadSynchronous(modelDirectory: URL) throws -> EmbeddingModel { - // create the model (no weights loaded) + // Load config.json once and decode for both base config and model-specific config let configurationURL = modelDirectory.appending(component: "config.json") - let baseConfig = try JSONDecoder().decode( - BaseConfiguration.self, from: Data(contentsOf: configurationURL)) + let configData = try Data(contentsOf: configurationURL) + let baseConfig = try JSONDecoder().decode(BaseConfiguration.self, from: configData) let modelType = ModelType(rawValue: baseConfig.modelType) - let model = try modelType.createModel(configuration: configurationURL) + let model = try modelType.createModel(configuration: configData) // load the weights var weights = [String: MLXArray]() diff --git a/Libraries/MLXLLM/LLMModelFactory.swift b/Libraries/MLXLLM/LLMModelFactory.swift index c6f105cd..09d70e56 100644 --- a/Libraries/MLXLLM/LLMModelFactory.swift +++ b/Libraries/MLXLLM/LLMModelFactory.swift @@ -6,13 +6,12 @@ import MLX import MLXLMCommon import Tokenizers -/// Creates a function that loads a configuration file and instantiates a model with the proper configuration +/// Creates a function that decodes configuration data and instantiates a model with the proper configuration private func create( _ configurationType: C.Type, _ modelInit: @escaping (C) -> M -) -> (URL) throws -> M { - { url in - let configuration = try JSONDecoder().decode( - C.self, from: Data(contentsOf: url)) +) -> (Data) throws -> M { + { data in + let configuration = try JSONDecoder().decode(C.self, from: data) return modelInit(configuration) } } @@ -478,13 +477,13 @@ public final class LLMModelFactory: ModelFactory { let modelDirectory = try await downloadModel( hub: hub, configuration: configuration, progressHandler: progressHandler) - // Load the generic config to understand which model and how to load the weights + // Load config.json once and decode for both base config and model-specific config let configurationURL = modelDirectory.appending(component: "config.json") - + let configData: Data let baseConfig: BaseConfiguration do { - baseConfig = try JSONDecoder().decode( - BaseConfiguration.self, from: Data(contentsOf: configurationURL)) + configData = try Data(contentsOf: configurationURL) + baseConfig = try JSONDecoder().decode(BaseConfiguration.self, from: configData) } catch let error as DecodingError { throw ModelFactoryError.configurationDecodingError( configurationURL.lastPathComponent, configuration.name, error) @@ -493,18 +492,21 @@ public final class LLMModelFactory: ModelFactory { let model: LanguageModel do { model = try await typeRegistry.createModel( - configuration: configurationURL, modelType: baseConfig.modelType) + configuration: configData, modelType: baseConfig.modelType) } catch let error as DecodingError { throw ModelFactoryError.configurationDecodingError( configurationURL.lastPathComponent, configuration.name, error) } - // apply the weights to the bare model + // Start tokenizer loading asynchronously, then load weights synchronously. + // Both operations run in parallel because async let begins execution immediately. + async let tokenizerTask = loadTokenizer(configuration: configuration, hub: hub) + try loadWeights( modelDirectory: modelDirectory, model: model, perLayerQuantization: baseConfig.perLayerQuantization) - let tokenizer = try await loadTokenizer(configuration: configuration, hub: hub) + let tokenizer = try await tokenizerTask let messageGenerator = if let model = model as? LLMModel { diff --git a/Libraries/MLXLMCommon/Registries/ModelTypeRegistry.swift b/Libraries/MLXLMCommon/Registries/ModelTypeRegistry.swift index 8fb2bcaf..a610f5e0 100644 --- a/Libraries/MLXLMCommon/Registries/ModelTypeRegistry.swift +++ b/Libraries/MLXLMCommon/Registries/ModelTypeRegistry.swift @@ -10,21 +10,22 @@ public actor ModelTypeRegistry { } /// Creates a registry with given creators. - public init(creators: [String: (URL) throws -> any LanguageModel]) { + public init(creators: [String: (Data) throws -> any LanguageModel]) { self.creators = creators } - private var creators: [String: (URL) throws -> any LanguageModel] + private var creators: [String: (Data) throws -> any LanguageModel] /// Add a new model to the type registry. public func registerModelType( - _ type: String, creator: @escaping (URL) throws -> any LanguageModel + _ type: String, creator: @escaping (Data) throws -> any LanguageModel ) { creators[type] = creator } - /// Given a `modelType` and configuration file instantiate a new `LanguageModel`. - public func createModel(configuration: URL, modelType: String) throws -> sending LanguageModel { + /// Given a `modelType` and configuration data instantiate a new `LanguageModel`. + public func createModel(configuration: Data, modelType: String) throws -> sending LanguageModel + { guard let creator = creators[modelType] else { throw ModelFactoryError.unsupportedModelType(modelType) } diff --git a/Libraries/MLXLMCommon/Registries/ProcessorTypeRegistry.swift b/Libraries/MLXLMCommon/Registries/ProcessorTypeRegistry.swift index 6c4c91eb..67d1491e 100644 --- a/Libraries/MLXLMCommon/Registries/ProcessorTypeRegistry.swift +++ b/Libraries/MLXLMCommon/Registries/ProcessorTypeRegistry.swift @@ -11,26 +11,26 @@ public actor ProcessorTypeRegistry { } /// Creates a registry with given creators. - public init(creators: [String: (URL, any Tokenizer) throws -> any UserInputProcessor]) { + public init(creators: [String: (Data, any Tokenizer) throws -> any UserInputProcessor]) { self.creators = creators } - private var creators: [String: (URL, any Tokenizer) throws -> any UserInputProcessor] + private var creators: [String: (Data, any Tokenizer) throws -> any UserInputProcessor] /// Add a new model to the type registry. public func registerProcessorType( _ type: String, creator: @escaping ( - URL, + Data, any Tokenizer ) throws -> any UserInputProcessor ) { creators[type] = creator } - /// Given a `processorType` and configuration file instantiate a new `UserInputProcessor`. - public func createModel(configuration: URL, processorType: String, tokenizer: any Tokenizer) + /// Given a `processorType` and configuration data instantiate a new `UserInputProcessor`. + public func createModel(configuration: Data, processorType: String, tokenizer: any Tokenizer) throws -> sending any UserInputProcessor { guard let creator = creators[processorType] else { diff --git a/Libraries/MLXVLM/VLMModelFactory.swift b/Libraries/MLXVLM/VLMModelFactory.swift index c047a2e2..b27786d1 100644 --- a/Libraries/MLXVLM/VLMModelFactory.swift +++ b/Libraries/MLXVLM/VLMModelFactory.swift @@ -48,10 +48,9 @@ public struct BaseProcessorConfiguration: Codable, Sendable { /// Creates a function that loads a configuration file and instantiates a model with the proper configuration private func create( _ configurationType: C.Type, _ modelInit: @escaping (C) -> M -) -> (URL) throws -> M { - { url in - let configuration = try JSONDecoder().decode( - C.self, from: Data(contentsOf: url)) +) -> (Data) throws -> M { + { data in + let configuration = try JSONDecoder().decode(C.self, from: data) return modelInit(configuration) } } @@ -63,10 +62,9 @@ private func create( C, any Tokenizer ) -> P -) -> (URL, any Tokenizer) throws -> P { - { url, tokenizer in - let configuration = try JSONDecoder().decode( - C.self, from: Data(contentsOf: url)) +) -> (Data, any Tokenizer) throws -> P { + { data, tokenizer in + let configuration = try JSONDecoder().decode(C.self, from: data) return processorInit(configuration, tokenizer) } } @@ -258,15 +256,13 @@ public final class VLMModelFactory: ModelFactory { let modelDirectory = try await downloadModel( hub: hub, configuration: configuration, progressHandler: progressHandler) - // load the generic config to understand which model and how to load the weights - let configurationURL = modelDirectory.appending( - component: "config.json" - ) - + // Load config.json once and decode for both base config and model-specific config + let configurationURL = modelDirectory.appending(component: "config.json") + let configData: Data let baseConfig: BaseConfiguration do { - baseConfig = try JSONDecoder().decode( - BaseConfiguration.self, from: Data(contentsOf: configurationURL)) + configData = try Data(contentsOf: configurationURL) + baseConfig = try JSONDecoder().decode(BaseConfiguration.self, from: configData) } catch let error as DecodingError { throw ModelFactoryError.configurationDecodingError( configurationURL.lastPathComponent, configuration.name, error) @@ -275,43 +271,26 @@ public final class VLMModelFactory: ModelFactory { let model: LanguageModel do { model = try await typeRegistry.createModel( - configuration: configurationURL, modelType: baseConfig.modelType) + configuration: configData, modelType: baseConfig.modelType) } catch let error as DecodingError { throw ModelFactoryError.configurationDecodingError( configurationURL.lastPathComponent, configuration.name, error) } - // apply the weights to the bare model + // Start tokenizer and processor config loading asynchronously, then load weights synchronously. + // All three operations run in parallel because async let begins execution immediately. + async let tokenizerTask = loadTokenizer(configuration: configuration, hub: hub) + async let processorConfigTask = loadProcessorConfig(from: modelDirectory) + try loadWeights( modelDirectory: modelDirectory, model: model, perLayerQuantization: baseConfig.perLayerQuantization) - let tokenizer = try await loadTokenizer( - configuration: configuration, - hub: hub - ) - - // Support both processor_config.json and preprocessor_config.json (prefer preprocessor_config.json) - let processorConfigURL = modelDirectory.appending(component: "processor_config.json") - let preprocessorConfigURL = modelDirectory.appending(component: "preprocessor_config.json") - let processorConfigurationURL = - FileManager.default.fileExists(atPath: preprocessorConfigURL.path) - ? preprocessorConfigURL - : processorConfigURL - - let baseProcessorConfig: BaseProcessorConfiguration - do { - baseProcessorConfig = try JSONDecoder().decode( - BaseProcessorConfiguration.self, - from: Data(contentsOf: processorConfigurationURL) - ) - } catch let error as DecodingError { - throw ModelFactoryError.configurationDecodingError( - processorConfigurationURL.lastPathComponent, configuration.name, error) - } + let tokenizer = try await tokenizerTask + let (processorConfigData, baseProcessorConfig) = try await processorConfigTask // Override processor type based on model type for models that need special handling - // Mistral3 model ship with "PixtralProcessor" in their config but need Mistral3Processor + // Mistral3 models ship with "PixtralProcessor" in their config but need Mistral3Processor // to handle spatial merging correctly let processorTypeOverrides: [String: String] = [ "mistral3": "Mistral3Processor" @@ -320,7 +299,7 @@ public final class VLMModelFactory: ModelFactory { processorTypeOverrides[baseConfig.modelType] ?? baseProcessorConfig.processorClass let processor = try await processorRegistry.createModel( - configuration: processorConfigurationURL, + configuration: processorConfigData, processorType: processorType, tokenizer: tokenizer) return .init( @@ -329,6 +308,21 @@ public final class VLMModelFactory: ModelFactory { } +/// Loads processor configuration, preferring preprocessor_config.json over processor_config.json. +private func loadProcessorConfig(from modelDirectory: URL) async throws -> ( + Data, BaseProcessorConfiguration +) { + let processorConfigURL = modelDirectory.appending(component: "processor_config.json") + let preprocessorConfigURL = modelDirectory.appending(component: "preprocessor_config.json") + let url = + FileManager.default.fileExists(atPath: preprocessorConfigURL.path) + ? preprocessorConfigURL + : processorConfigURL + let data = try Data(contentsOf: url) + let config = try JSONDecoder().decode(BaseProcessorConfiguration.self, from: data) + return (data, config) +} + public class TrampolineModelFactory: NSObject, ModelFactoryTrampoline { public static func modelFactory() -> (any MLXLMCommon.ModelFactory)? { VLMModelFactory.shared From 58dbb8e33350e1478bbfc923788a9bed3d9948e8 Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Mon, 29 Dec 2025 09:53:11 +0100 Subject: [PATCH 3/4] Improve error handling --- Libraries/Embedders/Configuration.swift | 2 +- Libraries/Embedders/EmbeddingModel.swift | 3 +- Libraries/Embedders/Load.swift | 72 +++++++++++++++++++++--- Libraries/Embedders/Tokenizer.swift | 2 +- Libraries/MLXLLM/LLMModelFactory.swift | 7 ++- Libraries/MLXLMCommon/ModelFactory.swift | 3 + Libraries/MLXVLM/VLMModelFactory.swift | 37 ++++++++++-- 7 files changed, 110 insertions(+), 16 deletions(-) diff --git a/Libraries/Embedders/Configuration.swift b/Libraries/Embedders/Configuration.swift index 228babde..3a20c91c 100644 --- a/Libraries/Embedders/Configuration.swift +++ b/Libraries/Embedders/Configuration.swift @@ -73,7 +73,7 @@ private class ModelTypeRegistry: @unchecked Sendable { creators[rawValue] } guard let creator else { - throw EmbedderError(message: "Unsupported model type.") + throw EmbedderError.unsupportedModelType(rawValue) } return try creator(configuration) } diff --git a/Libraries/Embedders/EmbeddingModel.swift b/Libraries/Embedders/EmbeddingModel.swift index 1513387f..67db2d7c 100644 --- a/Libraries/Embedders/EmbeddingModel.swift +++ b/Libraries/Embedders/EmbeddingModel.swift @@ -51,7 +51,8 @@ public actor ModelContainer { async let tokenizerConfigTask = loadTokenizerConfig( configuration: configuration, hub: hub) - self.model = try loadSynchronous(modelDirectory: modelDirectory) + self.model = try loadSynchronous( + modelDirectory: modelDirectory, modelName: configuration.name) self.pooler = loadPooling(modelDirectory: modelDirectory) let (tokenizerConfig, tokenizerData) = try await tokenizerConfigTask diff --git a/Libraries/Embedders/Load.swift b/Libraries/Embedders/Load.swift index 0ef5010d..e2f193b3 100644 --- a/Libraries/Embedders/Load.swift +++ b/Libraries/Embedders/Load.swift @@ -6,8 +6,48 @@ import MLX import MLXNN import Tokenizers -struct EmbedderError: Error { - let message: String +public enum EmbedderError: LocalizedError { + case unsupportedModelType(String) + case configurationFileError(String, String, Error) + case configurationDecodingError(String, String, DecodingError) + case missingTokenizerConfig + + public var errorDescription: String? { + switch self { + case .unsupportedModelType(let type): + return "Unsupported model type: \(type)" + case .configurationFileError(let file, let modelName, let error): + return "Error reading '\(file)' for model '\(modelName)': \(error.localizedDescription)" + case .configurationDecodingError(let file, let modelName, let decodingError): + let errorDetail = extractDecodingErrorDetail(decodingError) + return "Failed to parse \(file) for model '\(modelName)': \(errorDetail)" + case .missingTokenizerConfig: + return "Missing tokenizer configuration" + } + } + + private func extractDecodingErrorDetail(_ error: DecodingError) -> String { + switch error { + case .keyNotFound(let key, let context): + let path = (context.codingPath + [key]).map { $0.stringValue }.joined(separator: ".") + return "Missing field '\(path)'" + case .typeMismatch(_, let context): + let path = context.codingPath.map { $0.stringValue }.joined(separator: ".") + return "Type mismatch at '\(path)'" + case .valueNotFound(_, let context): + let path = context.codingPath.map { $0.stringValue }.joined(separator: ".") + return "Missing value at '\(path)'" + case .dataCorrupted(let context): + if context.codingPath.isEmpty { + return "Invalid JSON" + } else { + let path = context.codingPath.map { $0.stringValue }.joined(separator: ".") + return "Invalid data at '\(path)'" + } + @unknown default: + return error.localizedDescription + } + } } func prepareModelDirectory( @@ -53,20 +93,38 @@ public func load( // Start tokenizer loading asynchronously, then load model synchronously. // Both operations run in parallel because async let begins execution immediately. async let tokenizerTask = loadTokenizer(configuration: configuration, hub: hub) - let model = try loadSynchronous(modelDirectory: modelDirectory) + let model = try loadSynchronous(modelDirectory: modelDirectory, modelName: configuration.name) let tokenizer = try await tokenizerTask return (model, tokenizer) } -func loadSynchronous(modelDirectory: URL) throws -> EmbeddingModel { +func loadSynchronous(modelDirectory: URL, modelName: String) throws -> EmbeddingModel { // Load config.json once and decode for both base config and model-specific config let configurationURL = modelDirectory.appending(component: "config.json") - let configData = try Data(contentsOf: configurationURL) - let baseConfig = try JSONDecoder().decode(BaseConfiguration.self, from: configData) + let configData: Data + do { + configData = try Data(contentsOf: configurationURL) + } catch { + throw EmbedderError.configurationFileError( + configurationURL.lastPathComponent, modelName, error) + } + let baseConfig: BaseConfiguration + do { + baseConfig = try JSONDecoder().decode(BaseConfiguration.self, from: configData) + } catch let error as DecodingError { + throw EmbedderError.configurationDecodingError( + configurationURL.lastPathComponent, modelName, error) + } let modelType = ModelType(rawValue: baseConfig.modelType) - let model = try modelType.createModel(configuration: configData) + let model: EmbeddingModel + do { + model = try modelType.createModel(configuration: configData) + } catch let error as DecodingError { + throw EmbedderError.configurationDecodingError( + configurationURL.lastPathComponent, modelName, error) + } // load the weights var weights = [String: MLXArray]() diff --git a/Libraries/Embedders/Tokenizer.swift b/Libraries/Embedders/Tokenizer.swift index 89abeb46..9d141ad6 100644 --- a/Libraries/Embedders/Tokenizer.swift +++ b/Libraries/Embedders/Tokenizer.swift @@ -45,7 +45,7 @@ func loadTokenizerConfig(configuration: ModelConfiguration, hub: HubApi) async t } guard let tokenizerConfig = try await config.tokenizerConfig else { - throw EmbedderError(message: "missing config") + throw EmbedderError.missingTokenizerConfig } let tokenizerData = try await config.tokenizerData return (tokenizerConfig, tokenizerData) diff --git a/Libraries/MLXLLM/LLMModelFactory.swift b/Libraries/MLXLLM/LLMModelFactory.swift index 09d70e56..85fdff2a 100644 --- a/Libraries/MLXLLM/LLMModelFactory.swift +++ b/Libraries/MLXLLM/LLMModelFactory.swift @@ -480,9 +480,14 @@ public final class LLMModelFactory: ModelFactory { // Load config.json once and decode for both base config and model-specific config let configurationURL = modelDirectory.appending(component: "config.json") let configData: Data - let baseConfig: BaseConfiguration do { configData = try Data(contentsOf: configurationURL) + } catch { + throw ModelFactoryError.configurationFileError( + configurationURL.lastPathComponent, configuration.name, error) + } + let baseConfig: BaseConfiguration + do { baseConfig = try JSONDecoder().decode(BaseConfiguration.self, from: configData) } catch let error as DecodingError { throw ModelFactoryError.configurationDecodingError( diff --git a/Libraries/MLXLMCommon/ModelFactory.swift b/Libraries/MLXLMCommon/ModelFactory.swift index a0c24014..a2a6da7d 100644 --- a/Libraries/MLXLMCommon/ModelFactory.swift +++ b/Libraries/MLXLMCommon/ModelFactory.swift @@ -7,6 +7,7 @@ import Tokenizers public enum ModelFactoryError: LocalizedError { case unsupportedModelType(String) case unsupportedProcessorType(String) + case configurationFileError(String, String, Error) case configurationDecodingError(String, String, DecodingError) case noModelFactoryAvailable @@ -16,6 +17,8 @@ public enum ModelFactoryError: LocalizedError { return "Unsupported model type: \(type)" case .unsupportedProcessorType(let type): return "Unsupported processor type: \(type)" + case .configurationFileError(let file, let modelName, let error): + return "Error reading '\(file)' for model '\(modelName)': \(error.localizedDescription)" case .noModelFactoryAvailable: return "No model factory available via ModelFactoryRegistry" case .configurationDecodingError(let file, let modelName, let decodingError): diff --git a/Libraries/MLXVLM/VLMModelFactory.swift b/Libraries/MLXVLM/VLMModelFactory.swift index b27786d1..056cd862 100644 --- a/Libraries/MLXVLM/VLMModelFactory.swift +++ b/Libraries/MLXVLM/VLMModelFactory.swift @@ -259,9 +259,14 @@ public final class VLMModelFactory: ModelFactory { // Load config.json once and decode for both base config and model-specific config let configurationURL = modelDirectory.appending(component: "config.json") let configData: Data - let baseConfig: BaseConfiguration do { configData = try Data(contentsOf: configurationURL) + } catch { + throw ModelFactoryError.configurationFileError( + configurationURL.lastPathComponent, configuration.name, error) + } + let baseConfig: BaseConfiguration + do { baseConfig = try JSONDecoder().decode(BaseConfiguration.self, from: configData) } catch let error as DecodingError { throw ModelFactoryError.configurationDecodingError( @@ -287,7 +292,18 @@ public final class VLMModelFactory: ModelFactory { perLayerQuantization: baseConfig.perLayerQuantization) let tokenizer = try await tokenizerTask - let (processorConfigData, baseProcessorConfig) = try await processorConfigTask + let processorConfigData: Data + let baseProcessorConfig: BaseProcessorConfiguration + do { + (processorConfigData, baseProcessorConfig) = try await processorConfigTask + } catch let error as ProcessorConfigError { + if let decodingError = error.underlying as? DecodingError { + throw ModelFactoryError.configurationDecodingError( + error.filename, configuration.name, decodingError) + } + throw ModelFactoryError.configurationFileError( + error.filename, configuration.name, error.underlying) + } // Override processor type based on model type for models that need special handling // Mistral3 models ship with "PixtralProcessor" in their config but need Mistral3Processor @@ -308,7 +324,14 @@ public final class VLMModelFactory: ModelFactory { } +/// Error wrapper that includes the filename for better error messages. +private struct ProcessorConfigError: Error { + let filename: String + let underlying: Error +} + /// Loads processor configuration, preferring preprocessor_config.json over processor_config.json. +/// Throws ProcessorConfigError wrapping any underlying error with the filename. private func loadProcessorConfig(from modelDirectory: URL) async throws -> ( Data, BaseProcessorConfiguration ) { @@ -318,9 +341,13 @@ private func loadProcessorConfig(from modelDirectory: URL) async throws -> ( FileManager.default.fileExists(atPath: preprocessorConfigURL.path) ? preprocessorConfigURL : processorConfigURL - let data = try Data(contentsOf: url) - let config = try JSONDecoder().decode(BaseProcessorConfiguration.self, from: data) - return (data, config) + do { + let data = try Data(contentsOf: url) + let config = try JSONDecoder().decode(BaseProcessorConfiguration.self, from: data) + return (data, config) + } catch { + throw ProcessorConfigError(filename: url.lastPathComponent, underlying: error) + } } public class TrampolineModelFactory: NSObject, ModelFactoryTrampoline { From 12c48ac9ec97080afb819b6266eec14bc7994348 Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Tue, 6 Jan 2026 20:52:00 +0100 Subject: [PATCH 4/4] Clarify parallelism in comments --- Libraries/Embedders/EmbeddingModel.swift | 3 +-- Libraries/Embedders/Load.swift | 3 +-- Libraries/MLXLLM/LLMModelFactory.swift | 3 +-- Libraries/MLXVLM/VLMModelFactory.swift | 7 +++++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Libraries/Embedders/EmbeddingModel.swift b/Libraries/Embedders/EmbeddingModel.swift index 67db2d7c..d7b59103 100644 --- a/Libraries/Embedders/EmbeddingModel.swift +++ b/Libraries/Embedders/EmbeddingModel.swift @@ -46,8 +46,7 @@ public actor ModelContainer { public init( hub: HubApi, modelDirectory: URL, configuration: ModelConfiguration ) async throws { - // Start tokenizer config loading asynchronously, then load model synchronously. - // Both operations run in parallel because async let begins execution immediately. + // Load tokenizer config and model in parallel using async let. async let tokenizerConfigTask = loadTokenizerConfig( configuration: configuration, hub: hub) diff --git a/Libraries/Embedders/Load.swift b/Libraries/Embedders/Load.swift index e2f193b3..8ee9494f 100644 --- a/Libraries/Embedders/Load.swift +++ b/Libraries/Embedders/Load.swift @@ -90,8 +90,7 @@ public func load( let modelDirectory = try await prepareModelDirectory( hub: hub, configuration: configuration, progressHandler: progressHandler) - // Start tokenizer loading asynchronously, then load model synchronously. - // Both operations run in parallel because async let begins execution immediately. + // Load tokenizer and model in parallel using async let. async let tokenizerTask = loadTokenizer(configuration: configuration, hub: hub) let model = try loadSynchronous(modelDirectory: modelDirectory, modelName: configuration.name) let tokenizer = try await tokenizerTask diff --git a/Libraries/MLXLLM/LLMModelFactory.swift b/Libraries/MLXLLM/LLMModelFactory.swift index 85fdff2a..7b2059b0 100644 --- a/Libraries/MLXLLM/LLMModelFactory.swift +++ b/Libraries/MLXLLM/LLMModelFactory.swift @@ -503,8 +503,7 @@ public final class LLMModelFactory: ModelFactory { configurationURL.lastPathComponent, configuration.name, error) } - // Start tokenizer loading asynchronously, then load weights synchronously. - // Both operations run in parallel because async let begins execution immediately. + // Load tokenizer and weights in parallel using async let. async let tokenizerTask = loadTokenizer(configuration: configuration, hub: hub) try loadWeights( diff --git a/Libraries/MLXVLM/VLMModelFactory.swift b/Libraries/MLXVLM/VLMModelFactory.swift index 056cd862..7a24085a 100644 --- a/Libraries/MLXVLM/VLMModelFactory.swift +++ b/Libraries/MLXVLM/VLMModelFactory.swift @@ -282,8 +282,10 @@ public final class VLMModelFactory: ModelFactory { configurationURL.lastPathComponent, configuration.name, error) } - // Start tokenizer and processor config loading asynchronously, then load weights synchronously. - // All three operations run in parallel because async let begins execution immediately. + // Load tokenizer, processor config, and weights in parallel using async let. + // Note: loadProcessorConfig does synchronous I/O but is marked async to enable + // parallel scheduling. This may briefly block a cooperative thread pool thread, + // but the config file is small and model loading is not a high-concurrency path. async let tokenizerTask = loadTokenizer(configuration: configuration, hub: hub) async let processorConfigTask = loadProcessorConfig(from: modelDirectory) @@ -331,6 +333,7 @@ private struct ProcessorConfigError: Error { } /// Loads processor configuration, preferring preprocessor_config.json over processor_config.json. +/// Marked async to enable parallel scheduling via async let, though the underlying I/O is synchronous. /// Throws ProcessorConfigError wrapping any underlying error with the filename. private func loadProcessorConfig(from modelDirectory: URL) async throws -> ( Data, BaseProcessorConfiguration