Skip to content

Commit 0a95630

Browse files
authored
Merge pull request #1 from lightonai/feature/cuda-acceleration
Add CUDA GPU acceleration feature
2 parents e0a10f4 + 2e201a3 commit 0a95630

File tree

8 files changed

+1838
-0
lines changed

8 files changed

+1838
-0
lines changed

Cargo.lock

Lines changed: 20 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ openblas-src = { version = "0.10", features = [
2828
], optional = true }
2929
accelerate-src = { version = "0.3", optional = true }
3030

31+
# CUDA dependencies (optional)
32+
cudarc = { version = "0.12", features = ["cuda-version-from-build-system", "cublas"], optional = true }
33+
3134
[dev-dependencies]
3235
criterion = { version = "0.5", features = ["html_reports"] }
3336
approx = "0.5"
@@ -42,11 +45,20 @@ openblas = ["blas-src", "openblas-src", "ndarray/blas"]
4245
# Use Apple Accelerate framework (macOS only, recommended for macOS)
4346
accelerate = ["blas-src", "accelerate-src", "ndarray/blas"]
4447

48+
# CUDA GPU acceleration
49+
# Requires CUDA toolkit installed (nvcc in PATH)
50+
cuda = ["cudarc"]
51+
4552
[[bin]]
4653
name = "compare-kmeans"
4754
path = "src/bin/compare_kmeans.rs"
4855
required-features = ["npy"]
4956

57+
[[bin]]
58+
name = "compare-kmeans-cuda"
59+
path = "src/bin/compare_kmeans_cuda.rs"
60+
required-features = ["npy", "cuda"]
61+
5062
[[bench]]
5163
name = "kmeans_benchmark"
5264
harness = false
benches/benchmark_comparison_cuda.py

Lines changed: 264 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,264 @@
1+
#!/usr/bin/env -S uv run
2+
# /// script
3+
# requires-python = ">=3.10"
4+
# dependencies = [
5+
# "numpy>=1.24",
6+
# ]
7+
# ///
8+
"""
9+
Benchmark comparison between fastkmeans-rs CPU and CUDA implementations.
10+
11+
This script measures and compares execution times for CPU and CUDA implementations
12+
across different dataset sizes.
13+
14+
Usage:
15+
uv run benches/benchmark_comparison_cuda.py
16+
17+
Requirements:
18+
- CUDA toolkit installed (with /usr/local/cuda symlink or CUDA_ROOT set)
19+
- Build CUDA feature: cargo build --release --features cuda,npy
20+
"""
21+
22+
import subprocess
23+
import sys
24+
import tempfile
25+
import time
26+
from pathlib import Path
27+
28+
import numpy as np
29+
30+
31+
def benchmark_rust_cpu(data: np.ndarray, k: int, seed: int, max_iters: int, tol: float) -> tuple[float, np.ndarray]:
32+
"""Benchmark Rust fastkmeans-rs CPU and return (time_seconds, centroids)."""
33+
script_dir = Path(__file__).parent
34+
project_root = script_dir.parent
35+
binary_path = project_root / "target" / "release" / "compare-kmeans"
36+
37+
if not binary_path.exists():
38+
print("Building compare-kmeans binary (CPU)...", file=sys.stderr)
39+
result = subprocess.run(
40+
["cargo", "build", "--release", "--features", "npy", "--bin", "compare-kmeans"],
41+
cwd=project_root,
42+
capture_output=True,
43+
text=True,
44+
)
45+
if result.returncode != 0:
46+
print(f"Build failed:\n{result.stderr}", file=sys.stderr)
47+
return None, None
48+
49+
with tempfile.TemporaryDirectory() as tmpdir:
50+
input_path = Path(tmpdir) / "input.npy"
51+
output_path = Path(tmpdir) / "output.npy"
52+
53+
np.save(input_path, data.astype(np.float32))
54+
55+
result = subprocess.run(
56+
[
57+
str(binary_path),
58+
str(input_path),
59+
str(output_path),
60+
str(k),
61+
str(seed),
62+
str(max_iters),
63+
str(tol),
64+
],
65+
cwd=project_root,
66+
capture_output=True,
67+
text=True,
68+
)
69+
70+
if result.returncode != 0:
71+
print(f"Rust CPU binary failed:\n{result.stderr}", file=sys.stderr)
72+
return None, None
73+
74+
# Parse training time from stdout (format: TRAIN_TIME_MS:123.456)
75+
elapsed = None
76+
for line in result.stdout.split('\n'):
77+
if line.startswith('TRAIN_TIME_MS:'):
78+
elapsed = float(line.split(':')[1]) / 1000.0 # Convert ms to seconds
79+
break
80+
81+
if elapsed is None:
82+
print("Warning: Could not parse training time from Rust CPU output", file=sys.stderr)
83+
elapsed = 0.0
84+
85+
centroids = np.load(output_path)
86+
87+
return elapsed, centroids
88+
89+
90+
def benchmark_rust_cuda(data: np.ndarray, k: int, seed: int, max_iters: int, tol: float) -> tuple[float, np.ndarray]:
91+
"""Benchmark Rust fastkmeans-rs CUDA and return (time_seconds, centroids)."""
92+
script_dir = Path(__file__).parent
93+
project_root = script_dir.parent
94+
binary_path = project_root / "target" / "release" / "compare-kmeans-cuda"
95+
96+
if not binary_path.exists():
97+
print("Building compare-kmeans-cuda binary...", file=sys.stderr)
98+
result = subprocess.run(
99+
["cargo", "build", "--release", "--features", "cuda,npy", "--bin", "compare-kmeans-cuda"],
100+
cwd=project_root,
101+
capture_output=True,
102+
text=True,
103+
)
104+
if result.returncode != 0:
105+
print(f"CUDA build failed (is CUDA toolkit installed?):\n{result.stderr}", file=sys.stderr)
106+
return None, None
107+
108+
with tempfile.TemporaryDirectory() as tmpdir:
109+
input_path = Path(tmpdir) / "input.npy"
110+
output_path = Path(tmpdir) / "output.npy"
111+
112+
np.save(input_path, data.astype(np.float32))
113+
114+
result = subprocess.run(
115+
[
116+
str(binary_path),
117+
str(input_path),
118+
str(output_path),
119+
str(k),
120+
str(seed),
121+
str(max_iters),
122+
str(tol),
123+
],
124+
cwd=project_root,
125+
capture_output=True,
126+
text=True,
127+
)
128+
129+
if result.returncode != 0:
130+
print(f"Rust CUDA binary failed:\n{result.stderr}", file=sys.stderr)
131+
return None, None
132+
133+
# Parse training time from stdout (format: TRAIN_TIME_MS:123.456)
134+
elapsed = None
135+
for line in result.stdout.split('\n'):
136+
if line.startswith('TRAIN_TIME_MS:'):
137+
elapsed = float(line.split(':')[1]) / 1000.0 # Convert ms to seconds
138+
break
139+
140+
if elapsed is None:
141+
print("Warning: Could not parse training time from Rust CUDA output", file=sys.stderr)
142+
elapsed = 0.0
143+
144+
centroids = np.load(output_path)
145+
146+
return elapsed, centroids
147+
148+
149+
def run_benchmark(n_samples: int, n_features: int, k: int, seed: int = 42, max_iters: int = 25, tol: float = 1e-8):
    """Time the CPU and CUDA backends on one synthetic dataset.

    The dataset is an ``(n_samples, n_features)`` float32 array of standard
    normal samples generated after seeding NumPy's global RNG with ``seed``.

    Returns:
        A dict with keys ``'rust_cpu'`` and ``'rust_cuda'`` (the timings
        returned by the respective benchmark functions) and
        ``'centroid_diff'`` (largest absolute element-wise difference between
        the two centroid arrays, or None when either backend returned no
        centroids).
    """
    np.random.seed(seed)
    samples = np.random.randn(n_samples, n_features).astype(np.float32)

    # CPU first, then CUDA, each on the identical data and parameters.
    cpu_seconds, cpu_centroids = benchmark_rust_cpu(samples, k, seed, max_iters, tol)
    cuda_seconds, cuda_centroids = benchmark_rust_cuda(samples, k, seed, max_iters, tol)

    # The comparison is only meaningful when both backends produced centroids.
    if cpu_centroids is None or cuda_centroids is None:
        centroid_gap = None
    else:
        centroid_gap = np.max(np.abs(cpu_centroids - cuda_centroids))

    return {
        'rust_cpu': cpu_seconds,
        'rust_cuda': cuda_seconds,
        'centroid_diff': centroid_gap,
    }
172+
173+
174+
def format_time(t):
    """Render ``t`` with three decimal places, or "N/A" when it is None."""
    return "N/A" if t is None else f"{t:.3f}"
179+
180+
181+
def format_speedup(base, target):
    """Format ``base / target`` as a two-decimal ratio such as "2.50x".

    Returns "N/A" when either value is None or when ``target`` is zero.
    """
    computable = base is not None and target is not None and target != 0
    if not computable:
        return "N/A"
    ratio = base / target
    return f"{ratio:.2f}x"
186+
187+
188+
def main():
    """Run every benchmark configuration and print a CPU-vs-CUDA report.

    Prints one table row per configuration (CPU time, CUDA time, speedup),
    followed by a summary with the average/best/worst speedup and the maximum
    absolute centroid difference for each configuration. Configurations where
    either timing is missing or zero are left out of the speedup statistics.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print(heavy_rule)
    print("Performance Comparison: Rust fastkmeans-rs CPU vs CUDA")
    print(heavy_rule)
    print("Note: Times measure only training (excluding file I/O overhead)")
    print()

    # Test configurations: (n_samples, n_features, k, description)
    configs = [
        (1_000, 64, 10, "Small"),
        (5_000, 64, 50, "Medium"),
        (10_000, 128, 100, "Large"),
        (25_000, 128, 100, "XL"),
        (50_000, 128, 256, "XXL"),
        (100_000, 128, 512, "Huge"),
    ]

    # Print header
    print(f"{'Config':<8} {'Samples':>10} {'Dims':>6} {'k':>5} {'CPU (s)':>10} {'CUDA (s)':>10} {'Speedup':>10}")
    print(light_rule)

    all_results = []
    for n_samples, n_features, k, desc in configs:
        # Emit the row label first so progress is visible while the run executes.
        print(f"{desc:<8} {n_samples:>10,} {n_features:>6} {k:>5} ", end="", flush=True)

        results = run_benchmark(n_samples, n_features, k)
        all_results.append((desc, n_samples, n_features, k, results))

        print(f"{format_time(results.get('rust_cpu')):>10} {format_time(results.get('rust_cuda')):>10} ", end="")

        # Speedup of CUDA over CPU
        speedup = format_speedup(results.get('rust_cpu'), results.get('rust_cuda'))
        print(f"{speedup:>10}")

    print(light_rule)

    # Summary statistics
    print("\n" + heavy_rule)
    print("SUMMARY")
    print(heavy_rule)

    # Keep each speedup paired with the row it came from. Rows without usable
    # timings are skipped, so positions in this list do not line up with
    # positions in all_results and must not be used to index it.
    cuda_speedups = []
    for row in all_results:
        timings = row[4]
        cpu_time = timings.get('rust_cpu')
        cuda_time = timings.get('rust_cuda')
        if cpu_time and cuda_time:
            cuda_speedups.append((cpu_time / cuda_time, row))

    if cuda_speedups:
        avg_cuda_speedup = sum(ratio for ratio, _ in cuda_speedups) / len(cuda_speedups)

        # max()/min() return the first extreme entry, matching list.index().
        max_cuda_speedup, best_config = max(cuda_speedups, key=lambda pair: pair[0])
        min_cuda_speedup, worst_config = min(cuda_speedups, key=lambda pair: pair[0])

        print("\nCUDA vs CPU Speedup:")
        print(f" Average: {avg_cuda_speedup:.2f}x")
        print(f" Best: {max_cuda_speedup:.2f}x ({best_config[0]}: {best_config[1]:,} samples, k={best_config[3]})")
        print(f" Worst: {min_cuda_speedup:.2f}x ({worst_config[0]}: {worst_config[1]:,} samples, k={worst_config[3]})")

    # Show centroid differences
    print("\nCentroid Differences (max absolute):")
    for desc, _, _, _, results in all_results:
        diff = results.get('centroid_diff')
        if diff is not None:
            print(f" {desc}: {diff:.6f}")
        else:
            print(f" {desc}: N/A")

    print()
261+
262+
263+
# Run the benchmark report only when executed as a script, not on import.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)