Skip to content

Commit 9dc58a6

Browse files
kinda broken but brilliant
1 parent f35fc96 commit 9dc58a6

File tree

302 files changed

+50539
-49
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

302 files changed

+50539
-49
lines changed

.github/workflows/benchmark.yml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Builds the Metal GEMM benchmark, runs it and plots the results on every
# push and pull request.
name: GEMM Benchmark
on: [push, pull_request]
jobs:
  benchmark:
    runs-on: macos-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install Python dependencies
        # seaborn is imported by the heatmap plot in benchmarks/analyze_benchmarks.py
        run: pip3 install pandas matplotlib seaborn
      - name: Build
        # NOTE(review): assumes build/tests/metal/ops already contains a
        # generated Makefile; no CMake configure step runs here - confirm.
        run: |
          cd build/tests/metal/ops
          make -j2
      - name: Run Benchmark
        run: ./build/tests/metal/ops/gemm_multi_device_bench_test
      - name: Analyze Results
        run: python3 benchmarks/analyze_benchmarks.py
      # Optional: Add a step to check for regression vs. a baseline CSV
      # - name: Check Regression
      #   run: python3 benchmarks/check_regression.py benchmarks_ops.csv baseline.csv

AI_History.md

Lines changed: 13069 additions & 0 deletions
Large diffs are not rendered by default.

CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -377,9 +377,11 @@ if (WITH_TENSOR_PARALLEL AND CUDA_DYNAMIC_LOADING)
377377
endif()
378378

379379
if(BUILD_CLI)
380-
add_subdirectory(cli)
380+
add_subdirectory(tests)
381+
add_subdirectory(tests/metal)
381382
endif()
382383

384+
383385
install(
384386
TARGETS ${PROJECT_NAME} EXPORT ${PROJECT_NAME}Targets
385387
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}

benchmarks/README.md

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
# GEMM Benchmark Suite: Apple Silicon Maximum Performance
2+
3+
## Overview
4+
This benchmark suite measures the performance of GEMM (General Matrix Multiply) operations across all available compute backends on Apple Silicon (CPU, Accelerate/BLAS, Metal, Metal Tiled, Metal Performance Shaders, and optionally ANE and FP16). It provides a reproducible framework for comparing performance, logging results, and visualizing speedups.
5+
6+
## Benchmark Modes
7+
- `cpu_naive`: Single-threaded C++ GEMM
8+
- `cpu_accelerate`: Apple Accelerate (BLAS) single-threaded
9+
- `cpu_gcd`: Multi-threaded GEMM using Grand Central Dispatch
10+
- `cpu_accgcd`: Accelerate+GCD (multi-threaded BLAS)
11+
- `metal`: Basic Metal kernel
12+
- `metal_batched`: Multiple Metal command buffers in flight
13+
- `metal_tiled`: Tiled/shared-memory Metal kernel (highly optimized)
14+
- `mps`: Metal Performance Shaders GEMM
15+
- `metal_fp16` (optional): Metal kernel in FP16
16+
- `mps_fp16` (optional): MPS GEMM in FP16
17+
- `ane`: Apple Neural Engine (if available)
18+
- `hybrid`: Batch split across CPU, GPU, and ANE in parallel (see below)
19+
20+
## How to Build and Run
21+
22+
1. **Build**
23+
```sh
24+
cd build/tests/metal/ops   # run from the repository root
25+
make -j$(sysctl -n hw.logicalcpu)
26+
```
27+
28+
2. **Run**
29+
```sh
30+
./gemm_multi_device_bench_test
31+
```
32+
Results are logged to `benchmarks_ops.csv`.
33+
34+
3. **Analyze**
35+
Use the provided Python script to generate plots and compare speedups.
36+
37+
## CSV Output Format
38+
```
39+
timestamp,commit,operator,mode,device,size,batch,avg_ms,status
40+
```
41+
For hybrid mode, the CSV includes per-device timing:
42+
```
43+
timestamp,commit,operator,mode,device,size,batch,cpu_ms,gpu_ms,ane_ms,total_ms,status
44+
```
45+
46+
## Plotting and Analysis
47+
Run the provided Python script:
48+
```sh
49+
python3 analyze_benchmarks.py
50+
```
51+
52+
## Device Selection
53+
The harness automatically detects and logs results for all available Metal devices (GPU, ANE, etc.).
54+
55+
## Hybrid Batching Mode (CPU + GPU + ANE)
56+
57+
The `hybrid` mode splits large batches across all available devices (CPU, GPU, ANE) to maximize throughput. Each device processes a portion of the batch in parallel. Timings for each device and the total are logged in the CSV for full transparency.
58+
59+
- **How it works:**
60+
- The batch is divided among CPU, GPU, and ANE based on availability and (optionally) device throughput.
61+
- GEMM runs in parallel on each device using separate threads.
62+
- The CSV logs per-device times (`cpu_ms`, `gpu_ms`, `ane_ms`) and the overall `total_ms`.
63+
- **Interpreting Results:**
64+
- Compare `total_ms` to single-device runs to see the benefit of hybrid parallelism.
65+
- Use the per-device columns to identify bottlenecks and optimize batch splitting.
66+
67+
## Extending
68+
- Add new kernels or modes by editing the harness and kernels.
69+
- Add new matrix sizes or batch sizes as needed.
70+
71+
## Contact
72+
For questions or contributions, open an issue or pull request on GitHub.

benchmarks/analyze_benchmarks.py

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
import pandas as pd
2+
import matplotlib.pyplot as plt
3+
import sys
4+
5+
def plot_absolute_latency(df):
    """Plot the best (minimum) Metal latency of each variant per matrix size.

    One line is drawn per variant. The figure is saved as
    ``gemm_absolute_latency.png`` in the current directory and then shown.

    Args:
        df: Benchmark results as a pandas DataFrame. The ``variant``,
            ``size`` and ``metal_ms`` columns are required.

    Returns:
        None. Nothing is drawn when ``df`` is empty, a required column is
        missing, or no latency value is available.
    """
    required = {"variant", "size", "metal_ms"}
    if df.empty or not required.issubset(df.columns):
        return
    pivot = df.pivot_table(index="variant", columns="size", values="metal_ms", aggfunc="min")
    # DataFrame.plot raises on a frame without numeric data (e.g. all-NaN latencies).
    if pivot.empty:
        return

    # Imported only once there is something to draw.
    import matplotlib.pyplot as plt

    # Transpose so matrix sizes run along the x axis, one line per variant.
    pivot.T.plot(marker='o', figsize=(12, 6))
    plt.title("Absolute Metal Latency (ms) by Variant and Matrix Size")
    plt.ylabel("Latency (ms)")
    plt.xlabel("Matrix Size")
    plt.grid(True, axis='y')
    plt.tight_layout()
    plt.savefig("gemm_absolute_latency.png")
    plt.show()
16+
17+
def plot_speedup_heatmap(df):
    """Draw a variant-by-size heatmap of the best Metal-vs-CPU speedup.

    The figure is saved as ``gemm_speedup_heatmap.png`` in the current
    directory and then shown.

    Args:
        df: Benchmark results as a pandas DataFrame. The ``variant``,
            ``size`` and ``speedup_vs_cpu`` columns are required.

    Returns:
        None. Nothing is drawn when ``df`` is empty, a required column is
        missing, no speedup value is available, or seaborn is not installed
        (a message is then written to stderr).
    """
    required = {"variant", "size", "speedup_vs_cpu"}
    if df.empty or not required.issubset(df.columns):
        return
    pivot = df.pivot_table(index="variant", columns="size", values="speedup_vs_cpu", aggfunc="max")
    if pivot.empty:
        return

    # seaborn is an optional extra: skip this one plot instead of aborting
    # the remaining plots when it is missing.
    try:
        import seaborn as sns
    except ImportError:
        print("seaborn is not installed; skipping the speedup heatmap.", file=sys.stderr)
        return
    import matplotlib.pyplot as plt

    plt.figure(figsize=(10, 7))
    sns.heatmap(pivot, annot=True, fmt=".1f", cmap="YlGnBu")
    plt.title("Speedup Heatmap: Metal vs. CPU")
    plt.ylabel("Variant")
    plt.xlabel("Matrix Size")
    plt.tight_layout()
    plt.savefig("gemm_speedup_heatmap.png")
    plt.show()
29+
30+
def plot_batch_scaling(df):
    """Plot Metal latency against batch size, one line per variant.

    The figure is saved as ``gemm_batch_scaling.png`` in the current
    directory and then shown.

    Args:
        df: Benchmark results as a pandas DataFrame. The ``batch``,
            ``variant`` and ``metal_ms`` columns are required.

    Returns:
        None. Nothing is drawn when ``df`` is empty, a required column is
        missing, or no variant has any rows.
    """
    required = {"batch", "variant", "metal_ms"}
    if df.empty or not required.issubset(df.columns):
        return
    # Order each variant's rows by batch so the line runs left to right.
    series = [
        (variant, rows.sort_values("batch"))
        for variant, rows in df.groupby("variant", sort=False)
    ]
    if not series:
        return

    import matplotlib.pyplot as plt

    # Start a fresh figure: on non-interactive backends plt.show() does not
    # close the previous figure, so plt.plot would otherwise draw onto it.
    plt.figure()
    for variant, rows in series:
        plt.plot(rows["batch"], rows["metal_ms"], marker='o', label=variant)
    plt.title("Batch Size Scaling (Metal)")
    plt.xlabel("Batch Size")
    plt.ylabel("Metal Latency (ms)")
    plt.legend()
    plt.grid(True, axis='y')
    plt.tight_layout()
    plt.savefig("gemm_batch_scaling.png")
    plt.show()
45+
46+
def plot_hybrid_breakdown(df):
    """Draw one bar chart of per-device latency for every hybrid result row.

    A row counts as hybrid when its ``variant`` contains "hybrid"
    (case-insensitive). Each chart shows whichever of ``cpu_ms``,
    ``metal_ms`` and ``ane_ms`` are present and non-null for that row, and is
    saved as ``gemm_hybrid_breakdown_<size>.png`` in the current directory.

    Args:
        df: Benchmark results as a pandas DataFrame. The ``variant`` and
            ``size`` columns are required.

    Returns:
        None. Nothing is drawn when ``df`` is empty, a required column is
        missing, or no hybrid row exists.
    """
    if df.empty or not {"variant", "size"}.issubset(df.columns):
        return
    # astype(str) keeps missing variants from producing NaN in the mask.
    is_hybrid = df["variant"].astype(str).str.lower().str.contains("hybrid")
    df_hybrid = df[is_hybrid]
    if df_hybrid.empty:
        return

    import matplotlib.pyplot as plt

    for _, row in df_hybrid.iterrows():
        timings = {
            col: row[col]
            for col in ("cpu_ms", "metal_ms", "ane_ms")
            if col in row.index and not pd.isnull(row[col])
        }
        if not timings:
            # No per-device timing recorded for this row.
            continue
        fig = plt.figure()
        plt.bar(list(timings.keys()), list(timings.values()))
        plt.title(f"Hybrid Breakdown (size={row['size']})")
        plt.ylabel("Latency (ms)")
        plt.tight_layout()
        plt.savefig(f"gemm_hybrid_breakdown_{row['size']}.png")
        plt.show()
        # One figure per row: release it so figures do not pile up.
        plt.close(fig)
66+
67+
def main():
    """Load a benchmark CSV and generate every plot its columns allow.

    The CSV path is taken from the first command-line argument and defaults
    to ``benchmarks_ops.csv``. When both ``cpu_ms`` and ``metal_ms`` are
    present, rows missing either value are dropped and a ``speedup_vs_cpu``
    column (``cpu_ms / metal_ms``) is added. Each plot is produced only when
    the columns it reads exist.
    """
    csv_file = sys.argv[1] if len(sys.argv) > 1 else "benchmarks_ops.csv"
    df = pd.read_csv(csv_file)

    # Only keep relevant columns that might exist; copy so that adding the
    # derived column below never writes into a slice of the loaded frame.
    wanted = ["variant", "size", "cpu_ms", "metal_ms", "ane_ms", "batch", "speedup"]
    df = df[[col for col in wanted if col in df.columns]].copy()

    # Compute speedup for each row (if both cpu_ms and metal_ms are available)
    if {"cpu_ms", "metal_ms"}.issubset(df.columns):
        df = df.dropna(subset=["cpu_ms", "metal_ms"])
        df["speedup_vs_cpu"] = df["cpu_ms"] / df["metal_ms"]

    if df.empty:
        print(f"No usable benchmark rows in {csv_file}; nothing to plot.", file=sys.stderr)
        return

    columns = set(df.columns)

    # Pivot for plotting: show speedup by variant and size
    if {"variant", "size", "speedup_vs_cpu"} <= columns:
        pivot = df.pivot_table(index="variant", columns="size", values="speedup_vs_cpu", aggfunc="max")
        if not pivot.empty:
            pivot.plot(kind="bar", figsize=(12, 6))
            plt.title("GEMM Speedup: Metal vs. CPU")
            plt.ylabel("Speedup (X)")
            plt.xlabel("Variant")
            plt.grid(True, axis='y')
            plt.tight_layout()
            plt.savefig("gemm_speedup_vs_cpu.png")
            plt.show()
            plot_speedup_heatmap(df)

    # Each remaining plot is attempted only when the columns it reads exist.
    if {"variant", "size", "metal_ms"} <= columns:
        plot_absolute_latency(df)
    if {"variant", "batch", "metal_ms"} <= columns:
        plot_batch_scaling(df)
    if {"variant", "size"} <= columns:
        plot_hybrid_breakdown(df)
92+
93+
if __name__ == "__main__":
94+
main()

benchmarks/gemm_speedup_vs_cpu.png

23.2 KB
Loading

benchmarks/test_benchmarks.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import os
2+
import subprocess
3+
import pandas as pd
4+
import pytest
5+
6+
def _find_benchmark_csv():
    """Return the first existing ``benchmarks_ops.csv`` candidate, or None.

    The candidate locations are tried relative to the current working
    directory first and then relative to the directory containing this file,
    so the lookup does not depend on where pytest is started.
    """
    relative_candidates = [
        'benchmarks_ops.csv',
        '../build/tests/metal/ops/benchmarks_ops.csv',
        '../../build/tests/metal/ops/benchmarks_ops.csv',
    ]
    here = os.path.dirname(os.path.abspath(__file__))
    candidates = relative_candidates + [
        os.path.join(here, path) for path in relative_candidates
    ]
    return next((path for path in candidates if os.path.exists(path)), None)


def test_benchmark_csv_exists():
    """The benchmark run must have produced a results CSV."""
    # Test functions must not return a value; the lookup lives in the helper.
    assert _find_benchmark_csv() is not None, 'benchmarks_ops.csv not found in any expected location.'


def test_benchmark_csv_content():
    """The results CSV has the expected columns and plausible Metal timings."""
    csv_path = _find_benchmark_csv()
    assert csv_path is not None, 'benchmarks_ops.csv not found in any expected location.'
    df = pd.read_csv(csv_path)
    # Check columns
    # Match actual columns
    expected = {'timestamp', 'commit', 'operator', 'variant', 'size', 'cpu_ms', 'metal_ms', 'speedup'}
    assert expected.issubset(df.columns)
    # Normalize to strings: pandas parses purely numeric sizes as integers
    # and missing variants as NaN, both of which break the .str accessor.
    variant = df['variant'].astype(str).str.lower()
    size = df['size'].astype(str)
    # There should be at least one Metal result
    assert (variant.str.contains('metal') | variant.str.contains('gpu')).any(), 'No Metal or GPU results found.'
    # Speedup check: for large matrices, Metal should be faster than CPU
    is_large = size.str.contains('512')
    df_large = df[is_large & (df['variant'] == 'metal')]
    df_cpu = df[is_large & (df['variant'] == 'cpu_naive')]
    if not df_large.empty and not df_cpu.empty:
        metal_ms = df_large.iloc[0]['metal_ms']
        cpu_ms = df_cpu.iloc[0]['cpu_ms']
        assert metal_ms < cpu_ms, f'Metal not faster than CPU for large matrix: {metal_ms} vs {cpu_ms}'
36+
37+
if __name__ == '__main__':
38+
pytest.main([__file__])

pyproject.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
[build-system]
2-
requires = ["setuptools>=61.0"]
2+
requires = ["setuptools>=61.0", "cmake>=3.12", "pybind11>=2.6.0"]
33
build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "ctranslate2"
77
version = "4.0.0"
88
description = "Fork of ctranslate2 with Metal support"
99
authors = [
10-
{ name = "Your Name", email = "your.email@example.com" }
10+
{ name = "NADOO", email = "info@nadoo.de" }
1111
]
1212
readme = "README.md"
1313
requires-python = ">=3.7"
@@ -16,7 +16,7 @@ keywords = ["ctranslate2", "metal", "gpu"]
1616
classifiers = [
1717
"Programming Language :: Python :: 3",
1818
"License :: OSI Approved :: MIT License",
19-
"Operating System :: OS Independent",
19+
"Operating System :: MacOS :: MacOS X",
2020
]
2121

2222
[project.urls]
-99.2 KB
Binary file not shown.

python/setup.py

Lines changed: 54 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -2,55 +2,57 @@
22
import platform
33
import subprocess
44
import sys
5+
from pathlib import Path
56

67
import pybind11
78
from setuptools import Extension, setup
89
from setuptools.command.build_ext import build_ext
910

1011
VERSION = "4.5.0" # Fixed version number matching the installed library
1112

12-
include_dirs = [
13-
pybind11.get_include(),
14-
"/usr/local/include", # System-installed CTranslate2 headers
15-
]
16-
library_dirs = ["/usr/local/lib"] # System-installed CTranslate2 library
17-
18-
libraries = ["ctranslate2"]
19-
extra_compile_args = []
20-
extra_link_args = []
21-
22-
if platform.system() == "Darwin":
23-
extra_compile_args += [
24-
"-std=c++17",
25-
"-mmacosx-version-min=10.14",
26-
"-fvisibility=default", # Make all symbols visible by default
27-
"-undefined", "dynamic_lookup", # Allow undefined symbols to be looked up at runtime
28-
]
29-
extra_link_args += [
30-
"-mmacosx-version-min=10.14",
31-
"-Wl,-rpath,/usr/local/lib", # Add rpath to find the library
32-
"-Wl,-dead_strip_dylibs", # Remove unused libraries
33-
"-Wl,-bind_at_load", # Bind all symbols at load time
13+
def build_cpp_lib():
    """Configure, build and install the C++ library with CMake.

    The project root is taken to be the parent of this file's directory; the
    build tree is ``<root>/build`` and the install prefix is ``sys.prefix``.

    Raises:
        RuntimeError: If the current platform is not macOS.
        subprocess.CalledProcessError: If any CMake invocation fails.
    """
    if platform.system() != "Darwin":
        raise RuntimeError("This package only supports macOS")

    # Get the root directory of the project
    root_dir = Path(__file__).parent.parent.absolute()

    # Run CMake configuration
    build_dir = root_dir / "build"
    build_dir.mkdir(exist_ok=True)

    # Homebrew installs under /opt/homebrew on Apple Silicon and /usr/local
    # on Intel Macs; use whichever libomp exists, defaulting to the former.
    libomp_candidates = ("/opt/homebrew/opt/libomp", "/usr/local/opt/libomp")
    libomp_prefix = next(
        (prefix for prefix in libomp_candidates if os.path.isdir(prefix)),
        libomp_candidates[0],
    )
    openmp_flags = f"-Xpreprocessor -fopenmp -I{libomp_prefix}/include"

    cmake_args = [
        "-DCMAKE_BUILD_TYPE=Release",
        "-DWITH_METAL=ON",
        "-DWITH_MKL=OFF",
        "-DWITH_DNNL=OFF",
        "-DWITH_CUDA=OFF",
        "-DWITH_CUDNN=OFF",
        "-DBUILD_TESTS=OFF",
        "-DCMAKE_CXX_FLAGS=-std=c++17",
        f"-DOpenMP_C_FLAGS={openmp_flags}",
        f"-DOpenMP_CXX_FLAGS={openmp_flags}",
        "-DOpenMP_C_LIB_NAMES=omp",
        "-DOpenMP_CXX_LIB_NAMES=omp",
        f"-DOpenMP_omp_LIBRARY={libomp_prefix}/lib/libomp.dylib",
        f"-DCMAKE_INSTALL_PREFIX={sys.prefix}",
    ]

    subprocess.check_call(["cmake", "-S", str(root_dir), "-B", str(build_dir)] + cmake_args)

    # Build and install. os.cpu_count() may return None, which would
    # otherwise be passed to CMake as the literal string "None".
    jobs = os.cpu_count() or 1
    subprocess.check_call(["cmake", "--build", str(build_dir), "-j", str(jobs)])
    subprocess.check_call(["cmake", "--install", str(build_dir)])
3747

3848
class CustomBuildExt(build_ext):
39-
"""A custom build_ext command to add install_name_tool step."""
49+
"""Custom build command that builds the C++ library first."""
4050
def run(self):
41-
build_ext.run(self)
42-
if platform.system() == "Darwin":
43-
# Fix the library path in the extension
44-
ext_path = self.get_ext_fullpath(self.extensions[0].name)
45-
subprocess.check_call([
46-
"install_name_tool",
47-
"-change",
48-
"@rpath/libctranslate2.4.dylib",
49-
"/usr/local/lib/libctranslate2.4.dylib",
50-
ext_path
51-
])
51+
build_cpp_lib()
52+
super().run()
5253

53-
ctranslate2_module = Extension(
54+
# Define the extension module
55+
ext_module = Extension(
5456
"ctranslate2._ext",
5557
sources=[
5658
os.path.join("cpp", name)
@@ -71,14 +73,22 @@ def run(self):
7173
"whisper.cc", # Added whisper.cc
7274
]
7375
],
74-
include_dirs=include_dirs,
75-
library_dirs=library_dirs,
76-
libraries=libraries,
77-
extra_compile_args=extra_compile_args,
78-
extra_link_args=extra_link_args,
79-
language="c++",
76+
include_dirs=[
77+
pybind11.get_include(),
78+
f"{sys.prefix}/include",
79+
],
80+
library_dirs=[f"{sys.prefix}/lib"],
81+
libraries=["ctranslate2"],
82+
extra_compile_args=["-std=c++17", "-mmacosx-version-min=10.14"],
83+
extra_link_args=[
84+
"-mmacosx-version-min=10.14",
85+
"-Wl,-rpath,@loader_path/../lib"
86+
],
8087
)
8188

89+
if platform.machine() == "arm64":
90+
os.environ["ARCHFLAGS"] = "-arch arm64"
91+
8292
setup(
8393
name="ctranslate2",
8494
version=VERSION,
@@ -117,7 +127,7 @@ def run(self):
117127
"pyyaml>=5.3,<7",
118128
],
119129
packages=["ctranslate2"],
120-
ext_modules=[ctranslate2_module],
130+
ext_modules=[ext_module],
121131
cmdclass={"build_ext": CustomBuildExt},
122132
entry_points={
123133
"console_scripts": [

0 commit comments

Comments
 (0)