From aa562fc55cc5eb85a5973d79dc4959eee78d1994 Mon Sep 17 00:00:00 2001
From: Anthony DePasquale <anthony@depasquale.org>
Date: Sun, 28 Dec 2025 02:41:30 +0100
Subject: [PATCH 1/4] Add model loading benchmarks

---
 Package.swift                                 |  12 ++
 Tests/Benchmarks/ModelLoadingBenchmarks.swift | 112 ++++++++++++++++++
 2 files changed, 124 insertions(+)
 create mode 100644 Tests/Benchmarks/ModelLoadingBenchmarks.swift

diff --git a/Package.swift b/Package.swift
index b00ec470..9cb51a20 100644
--- a/Package.swift
+++ b/Package.swift
@@ -104,6 +104,18 @@ let package = Package(
                 .enableExperimentalFeature("StrictConcurrency")
             ]
         ),
+        .testTarget(
+            name: "Benchmarks",
+            dependencies: [
+                "MLXLLM",
+                "MLXVLM",
+                "MLXLMCommon",
+            ],
+            path: "Tests/Benchmarks",
+            swiftSettings: [
+                .enableExperimentalFeature("StrictConcurrency")
+            ]
+        ),
         .target(
             name: "MLXEmbedders",
             dependencies: [
diff --git a/Tests/Benchmarks/ModelLoadingBenchmarks.swift b/Tests/Benchmarks/ModelLoadingBenchmarks.swift
new file mode 100644
index 00000000..a2dfc714
--- /dev/null
+++ b/Tests/Benchmarks/ModelLoadingBenchmarks.swift
@@ -0,0 +1,112 @@
+import Foundation
+import Hub
+import MLX
+import MLXLLM
+import MLXLMCommon
+import MLXVLM
+import Testing
+
+private let benchmarksEnabled = ProcessInfo.processInfo.environment["RUN_BENCHMARKS"] != nil
+
+/// Mean, median, population standard deviation and range of a non-empty list of timings.
+private struct BenchmarkStats {
+    let mean, median, stdDev, min, max: Double
+
+    /// Computes every statistic up front; traps via `precondition` when `times` is empty.
+    init(times: [Double]) {
+        precondition(!times.isEmpty, "BenchmarkStats requires at least one timing measurement")
+        let sorted = times.sorted()
+        let mean = times.reduce(0, +) / Double(times.count)
+        // Non-empty is guaranteed above, so index directly instead of using dead `?? 0` fallbacks.
+        self.min = sorted[0]
+        self.max = sorted[sorted.count - 1]
+        self.mean = mean
+        // Both indices coincide for odd counts; even counts average the two central samples.
+        self.median = (sorted[(sorted.count - 1) / 2] + sorted[sorted.count / 2]) / 2
+        let squaredDiffs = times.map { ($0 - mean) * ($0 - mean) }
+        self.stdDev = sqrt(squaredDiffs.reduce(0, +) / Double(times.count))
+    }
+
+    /// Prints the statistics, each suffixed with "ms", under a "`label` results:" heading.
+    func printSummary(label: String) {
+        print("\(label) results:")
+        print("  Mean:   \(String(format: "%.0f", mean))ms")
+        print("  Median: \(String(format: "%.0f", median))ms")
+        print("  StdDev: \(String(format: "%.1f", stdDev))ms")
+        print("  Range:  \(String(format: "%.0f", min))-\(String(format: "%.0f", max))ms")
+    }
+}
+
+@Suite(.serialized)
+struct ModelLoadingBenchmarks {
+
+    /// Benchmark LLM model loading (skipped unless the RUN_BENCHMARKS environment variable is set)
+    /// Tests: parallel tokenizer/weights, single config.json read
+    @Test(.enabled(if: benchmarksEnabled))
+    func loadLLM() async throws {
+        let modelId = "mlx-community/Qwen3-0.6B-4bit"
+        let hub = HubApi()
+        let config = ModelConfiguration(id: modelId)
+
+        // Warm-up run: ensure model is downloaded and caches are primed
+        _ = try await LLMModelFactory.shared.load(hub: hub, configuration: config) { _ in }
+        GPU.clearCache()
+
+        // Timed runs use the monotonic system uptime so wall-clock adjustments cannot skew them
+        let runs = 7
+        var times: [Double] = []
+
+        for i in 1 ... runs {
+            let start = ProcessInfo.processInfo.systemUptime
+
+            _ = try await LLMModelFactory.shared.load(
+                hub: hub,
+                configuration: config
+            ) { _ in }
+
+            let elapsed = (ProcessInfo.processInfo.systemUptime - start) * 1000
+            times.append(elapsed)
+            print("LLM load run \(i): \(String(format: "%.0f", elapsed))ms")
+
+            // Clear GPU cache to ensure independent measurements
+            GPU.clearCache()
+        }
+
+        BenchmarkStats(times: times).printSummary(label: "LLM load")
+    }
+
+    /// Benchmark VLM model loading (skipped unless the RUN_BENCHMARKS environment variable is set)
+    /// Tests: parallel tokenizer/weights, single config.json read, parallel processor config
+    @Test(.enabled(if: benchmarksEnabled))
+    func loadVLM() async throws {
+        let modelId = "mlx-community/Qwen2-VL-2B-Instruct-4bit"
+        let hub = HubApi()
+        let config = ModelConfiguration(id: modelId)
+
+        // Warm-up run: ensure model is downloaded and caches are primed
+        _ = try await VLMModelFactory.shared.load(hub: hub, configuration: config) { _ in }
+        GPU.clearCache()
+
+        // Timed runs use the monotonic system uptime so wall-clock adjustments cannot skew them
+        let runs = 7
+        var times: [Double] = []
+
+        for i in 1 ... runs {
+            let start = ProcessInfo.processInfo.systemUptime
+
+            _ = try await VLMModelFactory.shared.load(
+                hub: hub,
+                configuration: config
+            ) { _ in }
+
+            let elapsed = (ProcessInfo.processInfo.systemUptime - start) * 1000
+            times.append(elapsed)
+            print("VLM load run \(i): \(String(format: "%.0f", elapsed))ms")
+
+            // Clear GPU cache to ensure independent measurements
+            GPU.clearCache()
+        }
+
+        BenchmarkStats(times: times).printSummary(label: "VLM load")
+    }
+}

From c5725ae2c8ffbe23d15207939b56365a02e99c1e Mon Sep 17 00:00:00 2001
From: Anthony DePasquale <anthony@depasquale.org>
Date: Sun, 28 Dec 2025 00:39:22 +0100
Subject: [PATCH 2/4] Parallelize loading of weights, tokenizer, and processor
 config

---
 Libraries/Embedders/Configuration.swift       | 64 ++++++---------
 Libraries/Embedders/EmbeddingModel.swift      | 10 ++-
 Libraries/Embedders/Load.swift                | 14 ++--
 Libraries/MLXLLM/LLMModelFactory.swift        | 26 ++++---
 .../Registries/ModelTypeRegistry.swift        | 11 +--
 .../Registries/ProcessorTypeRegistry.swift    | 10 +--
 Libraries/MLXVLM/VLMModelFactory.swift        | 78 +++++++++----------
 7 files changed, 100 insertions(+), 113 deletions(-)

diff --git a/Libraries/Embedders/Configuration.swift b/Libraries/Embedders/Configuration.swift
index 90517291..228babde 100644
--- a/Libraries/Embedders/Configuration.swift
+++ b/Libraries/Embedders/Configuration.swift
@@ -33,60 +33,42 @@ private class ModelTypeRegistry: @unchecked Sendable {
     // to remain synchronous.
     private let lock = NSLock()
 
-    private var creators: [String: @Sendable (URL) throws -> EmbeddingModel] = [
-        "bert": {
-            url in
-            let configuration = try JSONDecoder().decode(
-                BertConfiguration.self, from: Data(contentsOf: url))
-            let model = BertModel(configuration)
-            return model
+    private var creators: [String: @Sendable (Data) throws -> EmbeddingModel] = [
+        "bert": { data in
+            let configuration = try JSONDecoder().decode(BertConfiguration.self, from: data)
+            return BertModel(configuration)
         },
-        "roberta": {
-            url in
-            let configuration = try JSONDecoder().decode(
-                BertConfiguration.self, from: Data(contentsOf: url))
-            let model = BertModel(configuration)
-            return model
+        "roberta": { data in
+            let configuration = try JSONDecoder().decode(BertConfiguration.self, from: data)
+            return BertModel(configuration)
         },
-        "xlm-roberta": {
-            url in
-            let configuration = try JSONDecoder().decode(
-                BertConfiguration.self, from: Data(contentsOf: url))
-            let model = BertModel(configuration)
-            return model
+        "xlm-roberta": { data in
+            let configuration = try JSONDecoder().decode(BertConfiguration.self, from: data)
+            return BertModel(configuration)
         },
-        "distilbert": {
-            url in
-            let configuration = try JSONDecoder().decode(
-                BertConfiguration.self, from: Data(contentsOf: url))
-            let model = BertModel(configuration)
-            return model
+        "distilbert": { data in
+            let configuration = try JSONDecoder().decode(BertConfiguration.self, from: data)
+            return BertModel(configuration)
         },
-        "nomic_bert": {
-            url in
-            let configuration = try JSONDecoder().decode(
-                NomicBertConfiguration.self, from: Data(contentsOf: url))
-            let model = NomicBertModel(configuration, pooler: false)
-            return model
+        "nomic_bert": { data in
+            let configuration = try JSONDecoder().decode(NomicBertConfiguration.self, from: data)
+            return NomicBertModel(configuration, pooler: false)
         },
-        "qwen3": {
-            url in
-            let configuration = try JSONDecoder().decode(
-                Qwen3Configuration.self, from: Data(contentsOf: url))
-            let model = Qwen3Model(configuration)
-            return model
+        "qwen3": { data in
+            let configuration = try JSONDecoder().decode(Qwen3Configuration.self, from: data)
+            return Qwen3Model(configuration)
         },
     ]
 
     public func registerModelType(
-        _ type: String, creator: @Sendable @escaping (URL) throws -> EmbeddingModel
+        _ type: String, creator: @Sendable @escaping (Data) throws -> EmbeddingModel
     ) {
         lock.withLock {
             creators[type] = creator
         }
     }
 
-    public func createModel(configuration: URL, rawValue: String) throws -> EmbeddingModel {
+    public func createModel(configuration: Data, rawValue: String) throws -> EmbeddingModel {
         let creator = lock.withLock {
             creators[rawValue]
         }
@@ -108,12 +90,12 @@ public struct ModelType: RawRepresentable, Codable, Sendable {
     }
 
     public static func registerModelType(
-        _ type: String, creator: @Sendable @escaping (URL) throws -> EmbeddingModel
+        _ type: String, creator: @Sendable @escaping (Data) throws -> EmbeddingModel
     ) {
         modelTypeRegistry.registerModelType(type, creator: creator)
     }
 
-    public func createModel(configuration: URL) throws -> EmbeddingModel {
+    public func createModel(configuration: Data) throws -> EmbeddingModel {
         try modelTypeRegistry.createModel(configuration: configuration, rawValue: rawValue)
     }
 }
diff --git a/Libraries/Embedders/EmbeddingModel.swift b/Libraries/Embedders/EmbeddingModel.swift
index 3c4fbed7..1513387f 100644
--- a/Libraries/Embedders/EmbeddingModel.swift
+++ b/Libraries/Embedders/EmbeddingModel.swift
@@ -46,13 +46,17 @@ public actor ModelContainer {
     public init(
         hub: HubApi, modelDirectory: URL, configuration: ModelConfiguration
     ) async throws {
+        // Start tokenizer config loading asynchronously, then load model synchronously.
+        // Both operations run in parallel because async let begins execution immediately.
+        async let tokenizerConfigTask = loadTokenizerConfig(
+            configuration: configuration, hub: hub)
+
         self.model = try loadSynchronous(modelDirectory: modelDirectory)
+        self.pooler = loadPooling(modelDirectory: modelDirectory)
 
-        let (tokenizerConfig, tokenizerData) = try await loadTokenizerConfig(
-            configuration: configuration, hub: hub)
+        let (tokenizerConfig, tokenizerData) = try await tokenizerConfigTask
         self.tokenizer = try PreTrainedTokenizer(
             tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData)
-        self.pooler = loadPooling(modelDirectory: modelDirectory)  //?? Pooling(strategy: .none)
     }
 
     /// Perform an action on the model and/or tokenizer. Callers _must_ eval any `MLXArray` before returning as
diff --git a/Libraries/Embedders/Load.swift b/Libraries/Embedders/Load.swift
index 488f3a34..0ef5010d 100644
--- a/Libraries/Embedders/Load.swift
+++ b/Libraries/Embedders/Load.swift
@@ -49,20 +49,24 @@ public func load(
 ) async throws -> (EmbeddingModel, Tokenizer) {
     let modelDirectory = try await prepareModelDirectory(
         hub: hub, configuration: configuration, progressHandler: progressHandler)
+
+    // Start tokenizer loading asynchronously, then load model synchronously.
+    // Both operations run in parallel because async let begins execution immediately.
+    async let tokenizerTask = loadTokenizer(configuration: configuration, hub: hub)
     let model = try loadSynchronous(modelDirectory: modelDirectory)
-    let tokenizer = try await loadTokenizer(configuration: configuration, hub: hub)
+    let tokenizer = try await tokenizerTask
 
     return (model, tokenizer)
 }
 
 func loadSynchronous(modelDirectory: URL) throws -> EmbeddingModel {
-    // create the model (no weights loaded)
+    // Load config.json once and decode for both base config and model-specific config
     let configurationURL = modelDirectory.appending(component: "config.json")
-    let baseConfig = try JSONDecoder().decode(
-        BaseConfiguration.self, from: Data(contentsOf: configurationURL))
+    let configData = try Data(contentsOf: configurationURL)
+    let baseConfig = try JSONDecoder().decode(BaseConfiguration.self, from: configData)
 
     let modelType = ModelType(rawValue: baseConfig.modelType)
-    let model = try modelType.createModel(configuration: configurationURL)
+    let model = try modelType.createModel(configuration: configData)
 
     // load the weights
     var weights = [String: MLXArray]()
diff --git a/Libraries/MLXLLM/LLMModelFactory.swift b/Libraries/MLXLLM/LLMModelFactory.swift
index c6f105cd..09d70e56 100644
--- a/Libraries/MLXLLM/LLMModelFactory.swift
+++ b/Libraries/MLXLLM/LLMModelFactory.swift
@@ -6,13 +6,12 @@ import MLX
 import MLXLMCommon
 import Tokenizers
 
-/// Creates a function that loads a configuration file and instantiates a model with the proper configuration
+/// Creates a function that decodes configuration data and instantiates a model with the proper configuration
 private func create<C: Codable, M>(
     _ configurationType: C.Type, _ modelInit: @escaping (C) -> M
-) -> (URL) throws -> M {
-    { url in
-        let configuration = try JSONDecoder().decode(
-            C.self, from: Data(contentsOf: url))
+) -> (Data) throws -> M {
+    { data in
+        let configuration = try JSONDecoder().decode(C.self, from: data)
         return modelInit(configuration)
     }
 }
@@ -478,13 +477,13 @@ public final class LLMModelFactory: ModelFactory {
         let modelDirectory = try await downloadModel(
             hub: hub, configuration: configuration, progressHandler: progressHandler)
 
-        // Load the generic config to understand which model and how to load the weights
+        // Load config.json once and decode for both base config and model-specific config
         let configurationURL = modelDirectory.appending(component: "config.json")
-
+        let configData: Data
         let baseConfig: BaseConfiguration
         do {
-            baseConfig = try JSONDecoder().decode(
-                BaseConfiguration.self, from: Data(contentsOf: configurationURL))
+            configData = try Data(contentsOf: configurationURL)
+            baseConfig = try JSONDecoder().decode(BaseConfiguration.self, from: configData)
         } catch let error as DecodingError {
             throw ModelFactoryError.configurationDecodingError(
                 configurationURL.lastPathComponent, configuration.name, error)
@@ -493,18 +492,21 @@ public final class LLMModelFactory: ModelFactory {
         let model: LanguageModel
         do {
             model = try await typeRegistry.createModel(
-                configuration: configurationURL, modelType: baseConfig.modelType)
+                configuration: configData, modelType: baseConfig.modelType)
         } catch let error as DecodingError {
             throw ModelFactoryError.configurationDecodingError(
                 configurationURL.lastPathComponent, configuration.name, error)
         }
 
-        // apply the weights to the bare model
+        // Start tokenizer loading asynchronously, then load weights synchronously.
+        // Both operations run in parallel because async let begins execution immediately.
+        async let tokenizerTask = loadTokenizer(configuration: configuration, hub: hub)
+
         try loadWeights(
             modelDirectory: modelDirectory, model: model,
             perLayerQuantization: baseConfig.perLayerQuantization)
 
-        let tokenizer = try await loadTokenizer(configuration: configuration, hub: hub)
+        let tokenizer = try await tokenizerTask
 
         let messageGenerator =
             if let model = model as? LLMModel {
diff --git a/Libraries/MLXLMCommon/Registries/ModelTypeRegistry.swift b/Libraries/MLXLMCommon/Registries/ModelTypeRegistry.swift
index 8fb2bcaf..a610f5e0 100644
--- a/Libraries/MLXLMCommon/Registries/ModelTypeRegistry.swift
+++ b/Libraries/MLXLMCommon/Registries/ModelTypeRegistry.swift
@@ -10,21 +10,22 @@ public actor ModelTypeRegistry {
     }
 
     /// Creates a registry with given creators.
-    public init(creators: [String: (URL) throws -> any LanguageModel]) {
+    public init(creators: [String: (Data) throws -> any LanguageModel]) {
         self.creators = creators
     }
 
-    private var creators: [String: (URL) throws -> any LanguageModel]
+    private var creators: [String: (Data) throws -> any LanguageModel]
 
     /// Add a new model to the type registry.
     public func registerModelType(
-        _ type: String, creator: @escaping (URL) throws -> any LanguageModel
+        _ type: String, creator: @escaping (Data) throws -> any LanguageModel
     ) {
         creators[type] = creator
     }
 
-    /// Given a `modelType` and configuration file instantiate a new `LanguageModel`.
-    public func createModel(configuration: URL, modelType: String) throws -> sending LanguageModel {
+    /// Given a `modelType` and configuration data instantiate a new `LanguageModel`.
+    public func createModel(configuration: Data, modelType: String) throws -> sending LanguageModel
+    {
         guard let creator = creators[modelType] else {
             throw ModelFactoryError.unsupportedModelType(modelType)
         }
diff --git a/Libraries/MLXLMCommon/Registries/ProcessorTypeRegistry.swift b/Libraries/MLXLMCommon/Registries/ProcessorTypeRegistry.swift
index 6c4c91eb..67d1491e 100644
--- a/Libraries/MLXLMCommon/Registries/ProcessorTypeRegistry.swift
+++ b/Libraries/MLXLMCommon/Registries/ProcessorTypeRegistry.swift
@@ -11,26 +11,26 @@ public actor ProcessorTypeRegistry {
     }
 
     /// Creates a registry with given creators.
-    public init(creators: [String: (URL, any Tokenizer) throws -> any UserInputProcessor]) {
+    public init(creators: [String: (Data, any Tokenizer) throws -> any UserInputProcessor]) {
         self.creators = creators
     }
 
-    private var creators: [String: (URL, any Tokenizer) throws -> any UserInputProcessor]
+    private var creators: [String: (Data, any Tokenizer) throws -> any UserInputProcessor]
 
     /// Add a new model to the type registry.
     public func registerProcessorType(
         _ type: String,
         creator:
             @escaping (
-                URL,
+                Data,
                 any Tokenizer
             ) throws -> any UserInputProcessor
     ) {
         creators[type] = creator
     }
 
-    /// Given a `processorType` and configuration file instantiate a new `UserInputProcessor`.
-    public func createModel(configuration: URL, processorType: String, tokenizer: any Tokenizer)
+    /// Given a `processorType` and configuration data instantiate a new `UserInputProcessor`.
+    public func createModel(configuration: Data, processorType: String, tokenizer: any Tokenizer)
         throws -> sending any UserInputProcessor
     {
         guard let creator = creators[processorType] else {
diff --git a/Libraries/MLXVLM/VLMModelFactory.swift b/Libraries/MLXVLM/VLMModelFactory.swift
index c047a2e2..b27786d1 100644
--- a/Libraries/MLXVLM/VLMModelFactory.swift
+++ b/Libraries/MLXVLM/VLMModelFactory.swift
@@ -48,10 +48,9 @@ public struct BaseProcessorConfiguration: Codable, Sendable {
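-/// Creates a function that loads a configuration file and instantiates a model with the proper configuration
+/// Creates a function that decodes configuration data and instantiates a model with the proper configuration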
 private func create<C: Codable, M>(
     _ configurationType: C.Type, _ modelInit: @escaping (C) -> M
-) -> (URL) throws -> M {
-    { url in
-        let configuration = try JSONDecoder().decode(
-            C.self, from: Data(contentsOf: url))
+) -> (Data) throws -> M {
+    { data in
+        let configuration = try JSONDecoder().decode(C.self, from: data)
         return modelInit(configuration)
     }
 }
@@ -63,10 +62,9 @@ private func create<C: Codable, P>(
             C,
             any Tokenizer
         ) -> P
-) -> (URL, any Tokenizer) throws -> P {
-    { url, tokenizer in
-        let configuration = try JSONDecoder().decode(
-            C.self, from: Data(contentsOf: url))
+) -> (Data, any Tokenizer) throws -> P {
+    { data, tokenizer in
+        let configuration = try JSONDecoder().decode(C.self, from: data)
         return processorInit(configuration, tokenizer)
     }
 }
@@ -258,15 +256,13 @@ public final class VLMModelFactory: ModelFactory {
         let modelDirectory = try await downloadModel(
             hub: hub, configuration: configuration, progressHandler: progressHandler)
 
-        // load the generic config to understand which model and how to load the weights
-        let configurationURL = modelDirectory.appending(
-            component: "config.json"
-        )
-
+        // Load config.json once and decode for both base config and model-specific config
+        let configurationURL = modelDirectory.appending(component: "config.json")
+        let configData: Data
         let baseConfig: BaseConfiguration
         do {
-            baseConfig = try JSONDecoder().decode(
-                BaseConfiguration.self, from: Data(contentsOf: configurationURL))
+            configData = try Data(contentsOf: configurationURL)
+            baseConfig = try JSONDecoder().decode(BaseConfiguration.self, from: configData)
         } catch let error as DecodingError {
             throw ModelFactoryError.configurationDecodingError(
                 configurationURL.lastPathComponent, configuration.name, error)
@@ -275,43 +271,26 @@ public final class VLMModelFactory: ModelFactory {
         let model: LanguageModel
         do {
             model = try await typeRegistry.createModel(
-                configuration: configurationURL, modelType: baseConfig.modelType)
+                configuration: configData, modelType: baseConfig.modelType)
         } catch let error as DecodingError {
             throw ModelFactoryError.configurationDecodingError(
                 configurationURL.lastPathComponent, configuration.name, error)
         }
 
-        // apply the weights to the bare model
+        // Start tokenizer and processor config loading asynchronously, then load weights synchronously.
+        // All three operations run in parallel because async let begins execution immediately.
+        async let tokenizerTask = loadTokenizer(configuration: configuration, hub: hub)
+        async let processorConfigTask = loadProcessorConfig(from: modelDirectory)
+
         try loadWeights(
             modelDirectory: modelDirectory, model: model,
             perLayerQuantization: baseConfig.perLayerQuantization)
 
-        let tokenizer = try await loadTokenizer(
-            configuration: configuration,
-            hub: hub
-        )
-
-        // Support both processor_config.json and preprocessor_config.json (prefer preprocessor_config.json)
-        let processorConfigURL = modelDirectory.appending(component: "processor_config.json")
-        let preprocessorConfigURL = modelDirectory.appending(component: "preprocessor_config.json")
-        let processorConfigurationURL =
-            FileManager.default.fileExists(atPath: preprocessorConfigURL.path)
-            ? preprocessorConfigURL
-            : processorConfigURL
-
-        let baseProcessorConfig: BaseProcessorConfiguration
-        do {
-            baseProcessorConfig = try JSONDecoder().decode(
-                BaseProcessorConfiguration.self,
-                from: Data(contentsOf: processorConfigurationURL)
-            )
-        } catch let error as DecodingError {
-            throw ModelFactoryError.configurationDecodingError(
-                processorConfigurationURL.lastPathComponent, configuration.name, error)
-        }
+        let tokenizer = try await tokenizerTask
+        let (processorConfigData, baseProcessorConfig) = try await processorConfigTask
 
         // Override processor type based on model type for models that need special handling
-        // Mistral3 model ship with "PixtralProcessor" in their config but need Mistral3Processor
+        // Mistral3 models ship with "PixtralProcessor" in their config but need Mistral3Processor
         // to handle spatial merging correctly
         let processorTypeOverrides: [String: String] = [
             "mistral3": "Mistral3Processor"
@@ -320,7 +299,7 @@ public final class VLMModelFactory: ModelFactory {
             processorTypeOverrides[baseConfig.modelType] ?? baseProcessorConfig.processorClass
 
         let processor = try await processorRegistry.createModel(
-            configuration: processorConfigurationURL,
+            configuration: processorConfigData,
             processorType: processorType, tokenizer: tokenizer)
 
         return .init(
@@ -329,6 +308,21 @@ public final class VLMModelFactory: ModelFactory {
 
 }
 
+/// Loads processor configuration, preferring preprocessor_config.json over processor_config.json.
+private func loadProcessorConfig(from modelDirectory: URL) async throws -> (
+    Data, BaseProcessorConfiguration
+) {
+    let processorConfigURL = modelDirectory.appending(component: "processor_config.json")
+    let preprocessorConfigURL = modelDirectory.appending(component: "preprocessor_config.json")
+    let url =
+        FileManager.default.fileExists(atPath: preprocessorConfigURL.path)
+        ? preprocessorConfigURL
+        : processorConfigURL
+    let data = try Data(contentsOf: url)
+    let config = try JSONDecoder().decode(BaseProcessorConfiguration.self, from: data)
+    return (data, config)
+}
+
 public class TrampolineModelFactory: NSObject, ModelFactoryTrampoline {
     public static func modelFactory() -> (any MLXLMCommon.ModelFactory)? {
         VLMModelFactory.shared

From 58dbb8e33350e1478bbfc923788a9bed3d9948e8 Mon Sep 17 00:00:00 2001
From: Anthony DePasquale <anthony@depasquale.org>
Date: Mon, 29 Dec 2025 09:53:11 +0100
Subject: [PATCH 3/4] Improve error handling

---
 Libraries/Embedders/Configuration.swift  |  2 +-
 Libraries/Embedders/EmbeddingModel.swift |  3 +-
 Libraries/Embedders/Load.swift           | 72 +++++++++++++++++++++---
 Libraries/Embedders/Tokenizer.swift      |  2 +-
 Libraries/MLXLLM/LLMModelFactory.swift   |  7 ++-
 Libraries/MLXLMCommon/ModelFactory.swift |  3 +
 Libraries/MLXVLM/VLMModelFactory.swift   | 37 ++++++++++--
 7 files changed, 110 insertions(+), 16 deletions(-)

diff --git a/Libraries/Embedders/Configuration.swift b/Libraries/Embedders/Configuration.swift
index 228babde..3a20c91c 100644
--- a/Libraries/Embedders/Configuration.swift
+++ b/Libraries/Embedders/Configuration.swift
@@ -73,7 +73,7 @@ private class ModelTypeRegistry: @unchecked Sendable {
             creators[rawValue]
         }
         guard let creator else {
-            throw EmbedderError(message: "Unsupported model type.")
+            throw EmbedderError.unsupportedModelType(rawValue)
         }
         return try creator(configuration)
     }
diff --git a/Libraries/Embedders/EmbeddingModel.swift b/Libraries/Embedders/EmbeddingModel.swift
index 1513387f..67db2d7c 100644
--- a/Libraries/Embedders/EmbeddingModel.swift
+++ b/Libraries/Embedders/EmbeddingModel.swift
@@ -51,7 +51,8 @@ public actor ModelContainer {
         async let tokenizerConfigTask = loadTokenizerConfig(
             configuration: configuration, hub: hub)
 
-        self.model = try loadSynchronous(modelDirectory: modelDirectory)
+        self.model = try loadSynchronous(
+            modelDirectory: modelDirectory, modelName: configuration.name)
         self.pooler = loadPooling(modelDirectory: modelDirectory)
 
         let (tokenizerConfig, tokenizerData) = try await tokenizerConfigTask
diff --git a/Libraries/Embedders/Load.swift b/Libraries/Embedders/Load.swift
index 0ef5010d..e2f193b3 100644
--- a/Libraries/Embedders/Load.swift
+++ b/Libraries/Embedders/Load.swift
@@ -6,8 +6,48 @@ import MLX
 import MLXNN
 import Tokenizers
 
-struct EmbedderError: Error {
-    let message: String
+/// Failures reported while loading an embedder: unknown model type, config problems, no tokenizer config.
+public enum EmbedderError: LocalizedError {
+    case unsupportedModelType(String)
+    // Associated values for both configuration cases: file name, model name, underlying error.
+    case configurationFileError(String, String, Error)
+    case configurationDecodingError(String, String, DecodingError)
+    case missingTokenizerConfig
+
+    /// Human-readable message; names the file and model when the case carries them.
+    public var errorDescription: String? {
+        switch self {
+        case let .unsupportedModelType(modelType):
+            return "Unsupported model type: \(modelType)"
+        case let .configurationFileError(file, name, cause):
+            return "Error reading '\(file)' for model '\(name)': \(cause.localizedDescription)"
+        case let .configurationDecodingError(file, name, cause):
+            let detail = extractDecodingErrorDetail(cause)
+            return "Failed to parse \(file) for model '\(name)': \(detail)"
+        case .missingTokenizerConfig:
+            return "Missing tokenizer configuration"
+        }
+    }
+
+    /// Condenses a `DecodingError` into a short phrase that includes the dot-joined coding path.
+    private func extractDecodingErrorDetail(_ error: DecodingError) -> String {
+        func dotted(_ keys: [any CodingKey]) -> String {
+            keys.map { $0.stringValue }.joined(separator: ".")
+        }
+        switch error {
+        case let .keyNotFound(missingKey, context):
+            return "Missing field '\(dotted(context.codingPath + [missingKey]))'"
+        case let .typeMismatch(_, context):
+            return "Type mismatch at '\(dotted(context.codingPath))'"
+        case let .valueNotFound(_, context):
+            return "Missing value at '\(dotted(context.codingPath))'"
+        case let .dataCorrupted(context):
+            guard !context.codingPath.isEmpty else { return "Invalid JSON" }
+            return "Invalid data at '\(dotted(context.codingPath))'"
+        @unknown default:
+            return error.localizedDescription
+        }
+    }
 }
 
 func prepareModelDirectory(
@@ -53,20 +93,38 @@ public func load(
     // Start tokenizer loading asynchronously, then load model synchronously.
     // Both operations run in parallel because async let begins execution immediately.
     async let tokenizerTask = loadTokenizer(configuration: configuration, hub: hub)
-    let model = try loadSynchronous(modelDirectory: modelDirectory)
+    let model = try loadSynchronous(modelDirectory: modelDirectory, modelName: configuration.name)
     let tokenizer = try await tokenizerTask
 
     return (model, tokenizer)
 }
 
-func loadSynchronous(modelDirectory: URL) throws -> EmbeddingModel {
+func loadSynchronous(modelDirectory: URL, modelName: String) throws -> EmbeddingModel {
     // Load config.json once and decode for both base config and model-specific config
     let configurationURL = modelDirectory.appending(component: "config.json")
-    let configData = try Data(contentsOf: configurationURL)
-    let baseConfig = try JSONDecoder().decode(BaseConfiguration.self, from: configData)
+    let configData: Data
+    do {
+        configData = try Data(contentsOf: configurationURL)
+    } catch {
+        throw EmbedderError.configurationFileError(
+            configurationURL.lastPathComponent, modelName, error)
+    }
+    let baseConfig: BaseConfiguration
+    do {
+        baseConfig = try JSONDecoder().decode(BaseConfiguration.self, from: configData)
+    } catch let error as DecodingError {
+        throw EmbedderError.configurationDecodingError(
+            configurationURL.lastPathComponent, modelName, error)
+    }
 
     let modelType = ModelType(rawValue: baseConfig.modelType)
-    let model = try modelType.createModel(configuration: configData)
+    let model: EmbeddingModel
+    do {
+        model = try modelType.createModel(configuration: configData)
+    } catch let error as DecodingError {
+        throw EmbedderError.configurationDecodingError(
+            configurationURL.lastPathComponent, modelName, error)
+    }
 
     // load the weights
     var weights = [String: MLXArray]()
diff --git a/Libraries/Embedders/Tokenizer.swift b/Libraries/Embedders/Tokenizer.swift
index 89abeb46..9d141ad6 100644
--- a/Libraries/Embedders/Tokenizer.swift
+++ b/Libraries/Embedders/Tokenizer.swift
@@ -45,7 +45,7 @@ func loadTokenizerConfig(configuration: ModelConfiguration, hub: HubApi) async t
     }
 
     guard let tokenizerConfig = try await config.tokenizerConfig else {
-        throw EmbedderError(message: "missing config")
+        throw EmbedderError.missingTokenizerConfig
     }
     let tokenizerData = try await config.tokenizerData
     return (tokenizerConfig, tokenizerData)
diff --git a/Libraries/MLXLLM/LLMModelFactory.swift b/Libraries/MLXLLM/LLMModelFactory.swift
index 09d70e56..85fdff2a 100644
--- a/Libraries/MLXLLM/LLMModelFactory.swift
+++ b/Libraries/MLXLLM/LLMModelFactory.swift
@@ -480,9 +480,14 @@ public final class LLMModelFactory: ModelFactory {
         // Load config.json once and decode for both base config and model-specific config
         let configurationURL = modelDirectory.appending(component: "config.json")
         let configData: Data
-        let baseConfig: BaseConfiguration
         do {
             configData = try Data(contentsOf: configurationURL)
+        } catch {
+            throw ModelFactoryError.configurationFileError(
+                configurationURL.lastPathComponent, configuration.name, error)
+        }
+        let baseConfig: BaseConfiguration
+        do {
             baseConfig = try JSONDecoder().decode(BaseConfiguration.self, from: configData)
         } catch let error as DecodingError {
             throw ModelFactoryError.configurationDecodingError(
diff --git a/Libraries/MLXLMCommon/ModelFactory.swift b/Libraries/MLXLMCommon/ModelFactory.swift
index a0c24014..a2a6da7d 100644
--- a/Libraries/MLXLMCommon/ModelFactory.swift
+++ b/Libraries/MLXLMCommon/ModelFactory.swift
@@ -7,6 +7,7 @@ import Tokenizers
 public enum ModelFactoryError: LocalizedError {
     case unsupportedModelType(String)
     case unsupportedProcessorType(String)
+    case configurationFileError(String, String, Error)
     case configurationDecodingError(String, String, DecodingError)
     case noModelFactoryAvailable
 
@@ -16,6 +17,8 @@ public enum ModelFactoryError: LocalizedError {
             return "Unsupported model type: \(type)"
         case .unsupportedProcessorType(let type):
             return "Unsupported processor type: \(type)"
+        case .configurationFileError(let file, let modelName, let error):
+            return "Error reading '\(file)' for model '\(modelName)': \(error.localizedDescription)"
         case .noModelFactoryAvailable:
             return "No model factory available via ModelFactoryRegistry"
         case .configurationDecodingError(let file, let modelName, let decodingError):
diff --git a/Libraries/MLXVLM/VLMModelFactory.swift b/Libraries/MLXVLM/VLMModelFactory.swift
index b27786d1..056cd862 100644
--- a/Libraries/MLXVLM/VLMModelFactory.swift
+++ b/Libraries/MLXVLM/VLMModelFactory.swift
@@ -259,9 +259,14 @@ public final class VLMModelFactory: ModelFactory {
         // Load config.json once and decode for both base config and model-specific config
         let configurationURL = modelDirectory.appending(component: "config.json")
         let configData: Data
-        let baseConfig: BaseConfiguration
         do {
             configData = try Data(contentsOf: configurationURL)
+        } catch {
+            throw ModelFactoryError.configurationFileError(
+                configurationURL.lastPathComponent, configuration.name, error)
+        }
+        let baseConfig: BaseConfiguration
+        do {
             baseConfig = try JSONDecoder().decode(BaseConfiguration.self, from: configData)
         } catch let error as DecodingError {
             throw ModelFactoryError.configurationDecodingError(
@@ -287,7 +292,18 @@ public final class VLMModelFactory: ModelFactory {
             perLayerQuantization: baseConfig.perLayerQuantization)
 
         let tokenizer = try await tokenizerTask
-        let (processorConfigData, baseProcessorConfig) = try await processorConfigTask
+        let processorConfigData: Data
+        let baseProcessorConfig: BaseProcessorConfiguration
+        do {
+            (processorConfigData, baseProcessorConfig) = try await processorConfigTask
+        } catch let error as ProcessorConfigError {
+            if let decodingError = error.underlying as? DecodingError {
+                throw ModelFactoryError.configurationDecodingError(
+                    error.filename, configuration.name, decodingError)
+            }
+            throw ModelFactoryError.configurationFileError(
+                error.filename, configuration.name, error.underlying)
+        }
 
         // Override processor type based on model type for models that need special handling
         // Mistral3 models ship with "PixtralProcessor" in their config but need Mistral3Processor
@@ -308,7 +324,14 @@ public final class VLMModelFactory: ModelFactory {
 
 }
 
+/// Error wrapper that pairs a failure with the config filename, for better error messages.
+private struct ProcessorConfigError: Error {
+    let filename: String  // name of the config file that was being loaded
+    let underlying: Error  // the original error that was caught
+}
+
 /// Loads processor configuration, preferring preprocessor_config.json over processor_config.json.
+/// Throws ProcessorConfigError wrapping any underlying error with the filename.
 private func loadProcessorConfig(from modelDirectory: URL) async throws -> (
     Data, BaseProcessorConfiguration
 ) {
@@ -318,9 +341,13 @@ private func loadProcessorConfig(from modelDirectory: URL) async throws -> (
         FileManager.default.fileExists(atPath: preprocessorConfigURL.path)
         ? preprocessorConfigURL
         : processorConfigURL
-    let data = try Data(contentsOf: url)
-    let config = try JSONDecoder().decode(BaseProcessorConfiguration.self, from: data)
-    return (data, config)
+    do {
+        let data = try Data(contentsOf: url)
+        let config = try JSONDecoder().decode(BaseProcessorConfiguration.self, from: data)
+        return (data, config)
+    } catch {
+        throw ProcessorConfigError(filename: url.lastPathComponent, underlying: error)
+    }
 }
 
 public class TrampolineModelFactory: NSObject, ModelFactoryTrampoline {

From 12c48ac9ec97080afb819b6266eec14bc7994348 Mon Sep 17 00:00:00 2001
From: Anthony DePasquale <anthony@depasquale.org>
Date: Tue, 6 Jan 2026 20:52:00 +0100
Subject: [PATCH 4/4] Clarify parallelism in comments

---
 Libraries/Embedders/EmbeddingModel.swift | 3 +--
 Libraries/Embedders/Load.swift           | 3 +--
 Libraries/MLXLLM/LLMModelFactory.swift   | 3 +--
 Libraries/MLXVLM/VLMModelFactory.swift   | 7 +++++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/Libraries/Embedders/EmbeddingModel.swift b/Libraries/Embedders/EmbeddingModel.swift
index 67db2d7c..d7b59103 100644
--- a/Libraries/Embedders/EmbeddingModel.swift
+++ b/Libraries/Embedders/EmbeddingModel.swift
@@ -46,8 +46,7 @@ public actor ModelContainer {
     public init(
         hub: HubApi, modelDirectory: URL, configuration: ModelConfiguration
     ) async throws {
-        // Start tokenizer config loading asynchronously, then load model synchronously.
-        // Both operations run in parallel because async let begins execution immediately.
+        // Load tokenizer config and model in parallel using async let.
         async let tokenizerConfigTask = loadTokenizerConfig(
             configuration: configuration, hub: hub)
 
diff --git a/Libraries/Embedders/Load.swift b/Libraries/Embedders/Load.swift
index e2f193b3..8ee9494f 100644
--- a/Libraries/Embedders/Load.swift
+++ b/Libraries/Embedders/Load.swift
@@ -90,8 +90,7 @@ public func load(
     let modelDirectory = try await prepareModelDirectory(
         hub: hub, configuration: configuration, progressHandler: progressHandler)
 
-    // Start tokenizer loading asynchronously, then load model synchronously.
-    // Both operations run in parallel because async let begins execution immediately.
+    // Load tokenizer and model in parallel using async let.
     async let tokenizerTask = loadTokenizer(configuration: configuration, hub: hub)
     let model = try loadSynchronous(modelDirectory: modelDirectory, modelName: configuration.name)
     let tokenizer = try await tokenizerTask
diff --git a/Libraries/MLXLLM/LLMModelFactory.swift b/Libraries/MLXLLM/LLMModelFactory.swift
index 85fdff2a..7b2059b0 100644
--- a/Libraries/MLXLLM/LLMModelFactory.swift
+++ b/Libraries/MLXLLM/LLMModelFactory.swift
@@ -503,8 +503,7 @@ public final class LLMModelFactory: ModelFactory {
                 configurationURL.lastPathComponent, configuration.name, error)
         }
 
-        // Start tokenizer loading asynchronously, then load weights synchronously.
-        // Both operations run in parallel because async let begins execution immediately.
+        // Load tokenizer and weights in parallel using async let.
         async let tokenizerTask = loadTokenizer(configuration: configuration, hub: hub)
 
         try loadWeights(
diff --git a/Libraries/MLXVLM/VLMModelFactory.swift b/Libraries/MLXVLM/VLMModelFactory.swift
index 056cd862..7a24085a 100644
--- a/Libraries/MLXVLM/VLMModelFactory.swift
+++ b/Libraries/MLXVLM/VLMModelFactory.swift
@@ -282,8 +282,10 @@ public final class VLMModelFactory: ModelFactory {
                 configurationURL.lastPathComponent, configuration.name, error)
         }
 
-        // Start tokenizer and processor config loading asynchronously, then load weights synchronously.
-        // All three operations run in parallel because async let begins execution immediately.
+        // Load tokenizer, processor config, and weights in parallel using async let.
+        // Note: loadProcessorConfig does synchronous I/O but is marked async to enable
+        // parallel scheduling. This may briefly block a cooperative thread pool thread,
+        // but the config file is small and model loading is not a high-concurrency path.
         async let tokenizerTask = loadTokenizer(configuration: configuration, hub: hub)
         async let processorConfigTask = loadProcessorConfig(from: modelDirectory)
 
@@ -331,6 +333,7 @@ private struct ProcessorConfigError: Error {
 }
 
 /// Loads processor configuration, preferring preprocessor_config.json over processor_config.json.
+/// Marked async to enable parallel scheduling via async let, though the underlying I/O is synchronous.
 /// Throws ProcessorConfigError wrapping any underlying error with the filename.
 private func loadProcessorConfig(from modelDirectory: URL) async throws -> (
     Data, BaseProcessorConfiguration