Skip to content

Commit 54cf942

Browse files
committed
Add model loading benchmarks
1 parent d9f46e3 commit 54cf942

File tree

2 files changed

+95
-0
lines changed

2 files changed

+95
-0
lines changed

Package.swift

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,18 @@ let package = Package(
104104
.enableExperimentalFeature("StrictConcurrency")
105105
]
106106
),
107+
// Benchmark target. The tests inside are gated at runtime by the
// RUN_BENCHMARKS environment variable (see Tests/Benchmarks), so this
// target is safe to build in CI without triggering model downloads.
.testTarget(
    name: "Benchmarks",
    dependencies: [
        "MLXLLM",
        "MLXVLM",
        "MLXLMCommon",
    ],
    path: "Tests/Benchmarks",
    swiftSettings: [
        // Keep concurrency checking consistent with the other targets.
        .enableExperimentalFeature("StrictConcurrency")
    ]
),
107119
.target(
108120
name: "MLXEmbedders",
109121
dependencies: [
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
import Foundation
2+
import Hub
3+
import MLX
4+
import MLXLLM
5+
import MLXLMCommon
6+
import MLXVLM
7+
import Testing
8+
9+
/// Benchmarks are opt-in: set RUN_BENCHMARKS in the environment to enable them,
/// so a plain `swift test` never triggers multi-hundred-MB model downloads.
private let benchmarksEnabled = ProcessInfo.processInfo.environment["RUN_BENCHMARKS"] != nil

/// Model-loading benchmarks. Serialized so concurrent loads don't contend for
/// the GPU / download cache and skew each other's timings.
@Suite(.serialized)
struct ModelLoadingBenchmarks {

    /// Shared measurement harness for both factories.
    ///
    /// Calls `load` once untimed as a warm-up (ensures the model files are
    /// already downloaded, so timed runs measure loading, not network), then
    /// times `runs` further invocations in milliseconds, clearing the GPU
    /// cache between runs so each measurement is independent. Per-run and
    /// average times are printed with `label` as the prefix.
    ///
    /// - Parameters:
    ///   - label: Prefix for the printed output (e.g. "LLM", "VLM").
    ///   - runs: Number of timed runs; must be positive.
    ///   - load: Performs one full model load.
    private func benchmarkLoad(
        label: String,
        runs: Int = 5,
        load: () async throws -> Void
    ) async throws {
        precondition(runs > 0, "runs must be positive")

        // Warm up: ensure model is downloaded
        try await load()

        var times: [Double] = []
        times.reserveCapacity(runs)

        for i in 1 ... runs {
            let start = CFAbsoluteTimeGetCurrent()
            try await load()
            let elapsed = (CFAbsoluteTimeGetCurrent() - start) * 1000
            times.append(elapsed)
            print("\(label) load run \(i): \(String(format: "%.0f", elapsed))ms")

            // Clear GPU cache to ensure independent measurements
            GPU.clearCache()
        }

        let avg = times.reduce(0, +) / Double(times.count)
        print("\(label) load average: \(String(format: "%.0f", avg))ms")
    }

    /// Benchmark LLM model loading
    /// Tests: parallel tokenizer/weights, single config.json read
    @Test(.enabled(if: benchmarksEnabled))
    func loadLLM() async throws {
        let modelId = "mlx-community/Qwen3-0.6B-4bit"
        let hub = HubApi()
        let config = ModelConfiguration(id: modelId)

        try await benchmarkLoad(label: "LLM") {
            _ = try await LLMModelFactory.shared.load(
                hub: hub,
                configuration: config
            ) { _ in }
        }
    }

    /// Benchmark VLM model loading
    /// Tests: parallel tokenizer/weights, single config.json read, parallel processor config
    @Test(.enabled(if: benchmarksEnabled))
    func loadVLM() async throws {
        let modelId = "mlx-community/Qwen2-VL-2B-Instruct-4bit"
        let hub = HubApi()
        let config = ModelConfiguration(id: modelId)

        try await benchmarkLoad(label: "VLM") {
            _ = try await VLMModelFactory.shared.load(
                hub: hub,
                configuration: config
            ) { _ in }
        }
    }
}

0 commit comments

Comments
 (0)