NADOOIT
diff --git a/‎.github/workflows/benchmark.yml‎
Lines changed: 20 additions & 0 deletions b/‎.github/workflows/benchmark.yml‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎AI_History.md‎
Lines changed: 13069 additions & 0 deletions b/‎AI_History.md‎
Lines changed: 13069 additions & 0 deletions
diff --git a/‎CMakeLists.txt‎
Lines changed: 3 additions & 1 deletion b/‎CMakeLists.txt‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎benchmarks/README.md‎
Lines changed: 72 additions & 0 deletions b/‎benchmarks/README.md‎
Lines changed: 72 additions & 0 deletions
diff --git a/‎benchmarks/analyze_benchmarks.py‎
Lines changed: 94 additions & 0 deletions b/‎benchmarks/analyze_benchmarks.py‎
Lines changed: 94 additions & 0 deletions
diff --git a/‎benchmarks/gemm_speedup_vs_cpu.png‎
23.2 KB b/‎benchmarks/gemm_speedup_vs_cpu.png‎
23.2 KB
diff --git a/‎benchmarks/test_benchmarks.py‎
Lines changed: 38 additions & 0 deletions b/‎benchmarks/test_benchmarks.py‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 3 additions & 3 deletions b/‎pyproject.toml‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎python/ctranslate2/_ext.cpython-312-darwin.so‎
-99.2 KB b/‎python/ctranslate2/_ext.cpython-312-darwin.so‎
-99.2 KB
diff --git a/‎python/setup.py‎
Lines changed: 54 additions & 44 deletions b/‎python/setup.py‎
Lines changed: 54 additions & 44 deletions
@@ -0,0 +1,20 @@
+name: GEMM Benchmark
+on: [push, pull_request]
+jobs:
+  benchmark:
+    runs-on: macos-latest
+    steps:
+      # checkout@v2 targets a Node.js runtime GitHub has retired; v4 is the supported release.
+      - uses: actions/checkout@v4
+      # Use a runner-managed interpreter so `pip3 install` does not touch the system Python.
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+      - name: Install Python dependencies
+        # seaborn is imported by benchmarks/analyze_benchmarks.py for the heatmap plot.
+        run: pip3 install pandas matplotlib seaborn
+      - name: Configure
+        # A fresh checkout has no build tree, so it must be generated before `make` can run.
+        # NOTE(review): these flags mirror the ones python/setup.py passes to CMake;
+        # confirm they are sufficient to generate the benchmark target on the runner.
+        run: cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DWITH_METAL=ON -DWITH_MKL=OFF -DWITH_DNNL=OFF
+      - name: Build
+        run: |
+          cd build/tests/metal/ops
+          make -j2
+      - name: Run Benchmark
+        run: ./build/tests/metal/ops/gemm_multi_device_bench_test
+      - name: Analyze Results
+        # The script calls plt.show(); a non-interactive backend keeps that from blocking the job.
+        env:
+          MPLBACKEND: Agg
+        run: python3 benchmarks/analyze_benchmarks.py
+      # Optional: Add a step to check for regression vs. a baseline CSV
+      # - name: Check Regression
+      #   run: python3 benchmarks/check_regression.py benchmarks_ops.csv baseline.csv
@@ -377,9 +377,11 @@ if (WITH_TENSOR_PARALLEL AND CUDA_DYNAMIC_LOADING)
 endif()
 
 if(BUILD_CLI)
-  add_subdirectory(cli)
+  # NOTE(review): this hunk swaps the `cli` subdirectory for the test trees, so
+  # BUILD_CLI now gates the tests and the CLI is no longer added here. Confirm
+  # that is intended rather than adding the tests alongside `cli`.
+  add_subdirectory(tests)
+add_subdirectory(tests/metal)
 endif()
 
+
 install(
   TARGETS ${PROJECT_NAME} EXPORT ${PROJECT_NAME}Targets
   RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
 
@@ -0,0 +1,72 @@
+# GEMM Benchmark Suite: Apple Silicon Maximum Performance
+
+## Overview
+This benchmark suite measures the performance of GEMM (General Matrix Multiply) operations across all available compute backends on Apple Silicon (CPU, Accelerate/BLAS, Metal, Metal Tiled, Metal Performance Shaders, and optionally ANE and FP16). It provides a reproducible framework for comparing performance, logging results, and visualizing speedups.
+
+## Benchmark Modes
+- `cpu_naive`: Single-threaded C++ GEMM
+- `cpu_accelerate`: Apple Accelerate (BLAS) single-threaded
+- `cpu_gcd`: Multi-threaded GEMM using Grand Central Dispatch
+- `cpu_accgcd`: Accelerate+GCD (multi-threaded BLAS)
+- `metal`: Basic Metal kernel
+- `metal_batched`: Multiple Metal command buffers in flight
+- `metal_tiled`: Tiled/shared-memory Metal kernel (highly optimized)
+- `mps`: Metal Performance Shaders GEMM
+- `metal_fp16` (optional): Metal kernel in FP16
+- `mps_fp16` (optional): MPS GEMM in FP16
+- `ane`: Apple Neural Engine (if available)
+- `hybrid`: Batch split across CPU, GPU, and ANE in parallel (see below)
+
+## How to Build and Run
+
+1. **Build**
+   ```sh
+   cd build/tests/metal/ops  # relative to the repository root, after configuring with CMake
+   make -j$(sysctl -n hw.logicalcpu)
+   ```
+
+2. **Run**
+   ```sh
+   ./gemm_multi_device_bench_test
+   ```
+   Results are logged to `benchmarks_ops.csv`.
+
+3. **Analyze**
+   Use the provided Python script to generate plots and compare speedups.
+
+## CSV Output Format
+```
+timestamp,commit,operator,mode,device,size,batch,avg_ms,status
+```
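+Note: `analyze_benchmarks.py` and `test_benchmarks.py` look up the columns `variant`, `cpu_ms`, `metal_ms` and `speedup`, which do not appear in the header above; make sure the harness output and this section agree.
+
+For hybrid mode, the CSV includes per-device timing: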
+```
+timestamp,commit,operator,mode,device,size,batch,cpu_ms,gpu_ms,ane_ms,total_ms,status
+```
+
+## Plotting and Analysis
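+Run the provided Python script from the `benchmarks/` directory. It reads `benchmarks_ops.csv` from the current directory unless a CSV path is passed as the first argument:
+```sh
+python3 analyze_benchmarks.py path/to/benchmarks_ops.csv
+```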
+
+## Device Selection
+The harness automatically detects and logs results for all available compute devices (Metal GPUs and, where present, the ANE).
+
+## Hybrid Batching Mode (CPU + GPU + ANE)
+
+The `hybrid` mode splits large batches across all available devices (CPU, GPU, ANE) to maximize throughput. Each device processes a portion of the batch in parallel. Timings for each device and the total are logged in the CSV for full transparency.
+
+- **How it works:**
+  - The batch is divided among CPU, GPU, and ANE based on availability and (optionally) device throughput.
+  - GEMM runs in parallel on each device using separate threads.
+  - The CSV logs per-device times (`cpu_ms`, `gpu_ms`, `ane_ms`) and the overall `total_ms`.
+- **Interpreting Results:**
+  - Compare `total_ms` to single-device runs to see the benefit of hybrid parallelism.
+  - Use the per-device columns to identify bottlenecks and optimize batch splitting.
+
+## Extending
+- Add new kernels or modes by editing the harness and kernels.
+- Add new matrix sizes or batch sizes as needed.
+
+## Contact
+For questions or contributions, open an issue or pull request on GitHub.
@@ -0,0 +1,94 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import sys
+
+def plot_absolute_latency(df):
+    """Plot the best Metal latency per variant against matrix size.
+
+    Takes the minimum ``metal_ms`` for every (variant, size) pair, draws one
+    line per variant and saves the figure to ``gemm_absolute_latency.png``.
+
+    Args:
+        df: Benchmark results with ``variant``, ``size`` and ``metal_ms``
+            columns.
+
+    The plot is skipped with a message, instead of raising, when a required
+    column is missing or when no row carries a Metal latency.
+    """
+    missing = {"variant", "size", "metal_ms"}.difference(df.columns)
+    if missing:
+        print(f"Skipping absolute latency plot; missing columns: {sorted(missing)}")
+        return
+    pivot = df.pivot_table(index="variant", columns="size", values="metal_ms", aggfunc="min")
+    if pivot.empty:
+        # DataFrame.plot raises on a frame without numeric data.
+        print("Skipping absolute latency plot; no Metal latency data.")
+        return
+    # Transpose so that sizes run along the x axis and each variant is a line.
+    pivot.T.plot(marker='o', figsize=(12,6))
+    plt.title("Absolute Metal Latency (ms) by Variant and Matrix Size")
+    plt.ylabel("Latency (ms)")
+    plt.xlabel("Matrix Size")
+    plt.grid(True, axis='y')
+    plt.tight_layout()
+    plt.savefig("gemm_absolute_latency.png")
+    plt.show()
+
+def plot_speedup_heatmap(df):
+    """Draw a variant-by-size heatmap of the best speedup over the CPU.
+
+    Uses the maximum ``speedup_vs_cpu`` for every (variant, size) pair and
+    saves the annotated heatmap to ``gemm_speedup_heatmap.png``.
+
+    Args:
+        df: Benchmark results with ``variant``, ``size`` and
+            ``speedup_vs_cpu`` columns.
+
+    The plot is skipped with a message when seaborn is not installed, when a
+    required column is missing, or when there is nothing to draw.
+    """
+    try:
+        import seaborn as sns
+    except ImportError:
+        # seaborn is only needed here; keep the remaining plots usable without it.
+        print("Skipping speedup heatmap; seaborn is not installed.")
+        return
+    missing = {"variant", "size", "speedup_vs_cpu"}.difference(df.columns)
+    if missing:
+        print(f"Skipping speedup heatmap; missing columns: {sorted(missing)}")
+        return
+    pivot = df.pivot_table(index="variant", columns="size", values="speedup_vs_cpu", aggfunc="max")
+    if pivot.empty:
+        print("Skipping speedup heatmap; no speedup data.")
+        return
+    plt.figure(figsize=(10,7))
+    sns.heatmap(pivot, annot=True, fmt=".1f", cmap="YlGnBu")
+    plt.title("Speedup Heatmap: Metal vs. CPU")
+    plt.ylabel("Variant")
+    plt.xlabel("Matrix Size")
+    plt.tight_layout()
+    plt.savefig("gemm_speedup_heatmap.png")
+    plt.show()
+
+def plot_batch_scaling(df):
+    """Plot Metal latency against batch size, one line per variant.
+
+    Saves the figure to ``gemm_batch_scaling.png``.
+
+    Args:
+        df: Benchmark results. Nothing is drawn unless the ``batch``,
+            ``variant`` and ``metal_ms`` columns are all present and at least
+            one row has both a batch size and a latency.
+
+    Rows of a variant are joined in ascending batch order; rows for different
+    matrix sizes are not separated.
+    """
+    if not {"batch", "variant", "metal_ms"}.issubset(df.columns):
+        return
+    points = df.dropna(subset=["batch", "metal_ms"])
+    if points.empty:
+        return
+    # Start a fresh figure: under a non-interactive backend plt.show() leaves
+    # the previous figure current, and these lines would be drawn on top of it.
+    plt.figure()
+    for variant, rows in points.groupby("variant", sort=False):
+        # Sort so each line runs left to right instead of zig-zagging in CSV order.
+        rows = rows.sort_values("batch")
+        plt.plot(rows["batch"], rows["metal_ms"], marker='o', label=variant)
+    plt.title("Batch Size Scaling (Metal)")
+    plt.xlabel("Batch Size")
+    plt.ylabel("Metal Latency (ms)")
+    plt.legend()
+    plt.grid(True, axis='y')
+    plt.tight_layout()
+    plt.savefig("gemm_batch_scaling.png")
+    plt.show()
+
+def plot_hybrid_breakdown(df):
+    """Draw one bar chart of per-device latency for every hybrid result row.
+
+    A row counts as hybrid when its ``variant`` contains "hybrid"
+    (case-insensitive). For each such row the available ``cpu_ms``,
+    ``metal_ms`` and ``ane_ms`` values are plotted and saved to
+    ``gemm_hybrid_breakdown_<size>.png``; further rows with the same size get
+    a numeric suffix so they do not overwrite each other.
+
+    Args:
+        df: Benchmark results. Nothing is drawn when there is no ``variant``
+            column or no hybrid row.
+    """
+    if "variant" not in df.columns:
+        return
+    # astype(str) keeps the substring match safe for missing or non-string variants.
+    is_hybrid = df["variant"].astype(str).str.lower().str.contains("hybrid")
+    df_hybrid = df[is_hybrid]
+    used_stems = {}
+    for _, row in df_hybrid.iterrows():
+        timings = {
+            col: row[col]
+            for col in ("cpu_ms", "metal_ms", "ane_ms")
+            if col in row.index and not pd.isnull(row[col])
+        }
+        if not timings:
+            # No per-device timing in this row, so there is nothing to plot.
+            continue
+        size = row["size"] if "size" in row.index else "unknown"
+        stem = f"gemm_hybrid_breakdown_{size}"
+        repeat = used_stems.get(stem, 0)
+        used_stems[stem] = repeat + 1
+        filename = f"{stem}.png" if repeat == 0 else f"{stem}_{repeat}.png"
+        fig = plt.figure()
+        plt.bar(list(timings), list(timings.values()))
+        plt.title(f"Hybrid Breakdown (size={size})")
+        plt.ylabel("Latency (ms)")
+        plt.tight_layout()
+        plt.savefig(filename)
+        plt.show()
+        # One figure is created per row; release it so long CSVs do not pile up figures.
+        plt.close(fig)
+
+def main():
+    """Load a benchmark CSV and generate all available plots.
+
+    The CSV path is taken from the first command-line argument and defaults to
+    ``benchmarks_ops.csv`` in the current directory. When both ``cpu_ms`` and
+    ``metal_ms`` are present, rows lacking either are dropped and a
+    ``speedup_vs_cpu`` column (``cpu_ms / metal_ms``) is added. Each plot is
+    produced only when the columns it reads exist.
+    """
+    csv_file = sys.argv[1] if len(sys.argv) > 1 else "benchmarks_ops.csv"
+    df = pd.read_csv(csv_file)
+    # Only keep relevant columns that might exist
+    keep_cols = [col for col in ["variant", "size", "cpu_ms", "metal_ms", "ane_ms", "batch", "speedup"] if col in df.columns]
+    # Work on an independent copy so adding a column below does not write
+    # into a slice of the frame returned by read_csv.
+    df = df[keep_cols].copy()
+    # Compute speedup for each row (if both cpu_ms and metal_ms are available)
+    if "cpu_ms" in df.columns and "metal_ms" in df.columns:
+        df = df.dropna(subset=["cpu_ms", "metal_ms"]).copy()
+        df["speedup_vs_cpu"] = df["cpu_ms"] / df["metal_ms"]
+    if df.empty:
+        print(f"No usable benchmark rows in {csv_file}; nothing to plot.")
+        return
+    columns = set(df.columns)
+    # Pivot for plotting: show speedup by variant and size
+    if {"variant", "size", "speedup_vs_cpu"} <= columns:
+        pivot = df.pivot_table(index="variant", columns="size", values="speedup_vs_cpu", aggfunc="max")
+        if not pivot.empty:
+            pivot.plot(kind="bar", figsize=(12,6))
+            plt.title("GEMM Speedup: Metal vs. CPU")
+            plt.ylabel("Speedup (X)")
+            plt.xlabel("Variant")
+            plt.grid(True, axis='y')
+            plt.tight_layout()
+            plt.savefig("gemm_speedup_vs_cpu.png")
+            plt.show()
+            plot_speedup_heatmap(df)
+    if {"variant", "size", "metal_ms"} <= columns:
+        plot_absolute_latency(df)
+    if {"batch", "variant", "metal_ms"} <= columns:
+        plot_batch_scaling(df)
+    if "variant" in columns:
+        plot_hybrid_breakdown(df)
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,38 @@
+import os
+import subprocess
+import pandas as pd
+import pytest
+
+# Locations, relative to a base directory, where the harness may have written its CSV.
+_CSV_CANDIDATES = (
+    'benchmarks_ops.csv',
+    '../build/tests/metal/ops/benchmarks_ops.csv',
+    '../../build/tests/metal/ops/benchmarks_ops.csv',
+)
+
+def _find_benchmark_csv():
+    """Return the path of the first existing benchmark CSV, or None.
+
+    Candidates are tried relative to the current working directory first and
+    then relative to this file, so the tests do not depend on where pytest is
+    started from.
+    """
+    here = os.path.dirname(os.path.abspath(__file__))
+    for base in (os.getcwd(), here):
+        for relative in _CSV_CANDIDATES:
+            path = os.path.normpath(os.path.join(base, relative))
+            if os.path.exists(path):
+                return path
+    return None
+
+def test_benchmark_csv_exists():
+    """The benchmark harness must have produced ``benchmarks_ops.csv``."""
+    # Tests must not return a value (pytest warns about it), so the lookup
+    # lives in a helper that both tests share.
+    assert _find_benchmark_csv() is not None, 'benchmarks_ops.csv not found in any expected location.'
+
+def test_benchmark_csv_content():
+    """The CSV has the expected columns, Metal results, and a sane speedup."""
+    csv_path = _find_benchmark_csv()
+    assert csv_path is not None, 'benchmarks_ops.csv not found in any expected location.'
+    df = pd.read_csv(csv_path)
+    # Check columns
+    required = {'timestamp', 'commit', 'operator', 'variant', 'size', 'cpu_ms', 'metal_ms', 'speedup'}
+    missing = required - set(df.columns)
+    assert not missing, f'Missing columns: {sorted(missing)}'
+    # There should be at least one Metal result
+    variant = df['variant'].astype(str).str.lower()
+    assert (variant.str.contains('metal') | variant.str.contains('gpu')).any(), 'No Metal or GPU results found.'
+    # Speedup check: for large matrices, Metal should be faster than CPU.
+    # astype(str) because pandas parses a purely numeric size column as
+    # integers, which have no .str accessor.
+    size = df['size'].astype(str)
+    df_large = df[size.str.contains('512') & (df['variant'] == 'metal')]
+    df_cpu = df[size.str.contains('512') & (df['variant'] == 'cpu_naive')]
+    if not df_large.empty and not df_cpu.empty:
+        metal_ms = df_large.iloc[0]['metal_ms']
+        cpu_ms = df_cpu.iloc[0]['cpu_ms']
+        assert metal_ms < cpu_ms, f'Metal not faster than CPU for large matrix: {metal_ms} vs {cpu_ms}'
+
+if __name__ == '__main__':
+    # Propagate pytest's exit code so a failing run is visible to the shell and CI.
+    raise SystemExit(pytest.main([__file__]))
@@ -1,13 +1,13 @@
 [build-system]
-requires = ["setuptools>=61.0"]
+requires = ["setuptools>=61.0", "cmake>=3.12", "pybind11>=2.6.0"]
 build-backend = "setuptools.build_meta"
 
 [project]
 name = "ctranslate2"
 version = "4.0.0"
 description = "Fork of ctranslate2 with Metal support"
 authors = [
-    { name = "Your Name", email = "your.email@example.com" }
+    { name = "NADOO", email = "info@nadoo.de" }
 ]
 readme = "README.md"
 requires-python = ">=3.7"
@@ -16,7 +16,7 @@ keywords = ["ctranslate2", "metal", "gpu"]
 classifiers = [
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: MIT License",
-    "Operating System :: OS Independent",
+    "Operating System :: MacOS :: MacOS X",
 ]
 
 [project.urls]
 
@@ -2,55 +2,57 @@
 import platform
 import subprocess
 import sys
+from pathlib import Path
 
 import pybind11
 from setuptools import Extension, setup
 from setuptools.command.build_ext import build_ext
 
 VERSION = "4.5.0"  # Fixed version number matching the installed library
 
-include_dirs = [
-    pybind11.get_include(),
-    "/usr/local/include",  # System-installed CTranslate2 headers
-]
-library_dirs = ["/usr/local/lib"]  # System-installed CTranslate2 library
-
-libraries = ["ctranslate2"]
-extra_compile_args = []
-extra_link_args = []
-
-if platform.system() == "Darwin":
-    extra_compile_args += [
-        "-std=c++17",
-        "-mmacosx-version-min=10.14",
-        "-fvisibility=default",  # Make all symbols visible by default
-        "-undefined", "dynamic_lookup",  # Allow undefined symbols to be looked up at runtime
-    ]
-    extra_link_args += [
-        "-mmacosx-version-min=10.14",
-        "-Wl,-rpath,/usr/local/lib",  # Add rpath to find the library
-        "-Wl,-dead_strip_dylibs",  # Remove unused libraries
-        "-Wl,-bind_at_load",  # Bind all symbols at load time
+def build_cpp_lib():
+    """Configure, build and install the C++ library with CMake.
+
+    The project root (the parent of this file's directory) is configured into
+    ``<root>/build`` with Metal enabled and MKL, DNNL, CUDA and cuDNN disabled,
+    then built and installed into ``sys.prefix``.
+
+    OpenMP is taken from a Homebrew-style libomp prefix. Set the
+    ``LIBOMP_PREFIX`` environment variable to override the default location.
+
+    Raises:
+        RuntimeError: When not running on macOS.
+        subprocess.CalledProcessError: When a CMake step fails.
+    """
+    if platform.system() != "Darwin":
+        raise RuntimeError("This package only supports macOS")
+
+    # Get the root directory of the project
+    root_dir = Path(__file__).parent.parent.absolute()
+
+    # Run CMake configuration
+    build_dir = root_dir / "build"
+    build_dir.mkdir(exist_ok=True)
+
+    # Homebrew lives under /opt/homebrew on Apple Silicon and /usr/local on
+    # Intel Macs; LIBOMP_PREFIX covers any other installation.
+    default_libomp = "/opt/homebrew/opt/libomp" if platform.machine() == "arm64" else "/usr/local/opt/libomp"
+    libomp_prefix = os.environ.get("LIBOMP_PREFIX", default_libomp)
+    openmp_flags = f"-Xpreprocessor -fopenmp -I{libomp_prefix}/include"
+
+    cmake_args = [
+        "-DCMAKE_BUILD_TYPE=Release",
+        "-DWITH_METAL=ON",
+        "-DWITH_MKL=OFF",
+        "-DWITH_DNNL=OFF",
+        "-DWITH_CUDA=OFF",
+        "-DWITH_CUDNN=OFF",
+        "-DBUILD_TESTS=OFF",
+        "-DCMAKE_CXX_FLAGS=-std=c++17",
+        f"-DOpenMP_C_FLAGS={openmp_flags}",
+        f"-DOpenMP_CXX_FLAGS={openmp_flags}",
+        "-DOpenMP_C_LIB_NAMES=omp",
+        "-DOpenMP_CXX_LIB_NAMES=omp",
+        f"-DOpenMP_omp_LIBRARY={libomp_prefix}/lib/libomp.dylib",
+        f"-DCMAKE_INSTALL_PREFIX={sys.prefix}"
     ]
-    if platform.machine() == "arm64":
-        os.environ["ARCHFLAGS"] = "-arch arm64"
+
+    subprocess.check_call(["cmake", "-S", str(root_dir), "-B", str(build_dir)] + cmake_args)
+
+    # Build and install. os.cpu_count() may return None, which would otherwise
+    # be passed to CMake as the literal string "None".
+    jobs = os.cpu_count() or 1
+    subprocess.check_call(["cmake", "--build", str(build_dir), "-j", str(jobs)])
+    subprocess.check_call(["cmake", "--install", str(build_dir)])
 
 class CustomBuildExt(build_ext):
-    """A custom build_ext command to add install_name_tool step."""
+    """build_ext command that builds and installs the C++ library first.
+
+    The extension links against ``ctranslate2`` from ``sys.prefix``, so the
+    library has to be built and installed before the stock build_ext step
+    compiles and links the extension.
+    """
     def run(self):
-        build_ext.run(self)
-        if platform.system() == "Darwin":
-            # Fix the library path in the extension
-            ext_path = self.get_ext_fullpath(self.extensions[0].name)
-            subprocess.check_call([
-                "install_name_tool",
-                "-change",
-                "@rpath/libctranslate2.4.dylib",
-                "/usr/local/lib/libctranslate2.4.dylib",
-                ext_path
-            ])
+        # Runs the full CMake configure/build/install on every build_ext call.
+        # NOTE(review): the removed install_name_tool fix-up is replaced by an
+        # rpath of @loader_path/../lib on the extension, while the library is
+        # installed under sys.prefix/lib. Confirm the two resolve to the same
+        # directory at load time.
+        build_cpp_lib()
+        super().run()
 
-ctranslate2_module = Extension(
+# Define the extension module
+ext_module = Extension(
     "ctranslate2._ext",
     sources=[
         os.path.join("cpp", name)
@@ -71,14 +73,22 @@ def run(self):
             "whisper.cc",  # Added whisper.cc
         ]
     ],
-    include_dirs=include_dirs,
-    library_dirs=library_dirs,
-    libraries=libraries,
-    extra_compile_args=extra_compile_args,
-    extra_link_args=extra_link_args,
-    language="c++",
+    include_dirs=[
+        pybind11.get_include(),
+        f"{sys.prefix}/include",
+    ],
+    library_dirs=[f"{sys.prefix}/lib"],
+    libraries=["ctranslate2"],
+    extra_compile_args=["-std=c++17", "-mmacosx-version-min=10.14"],
+    extra_link_args=[
+        "-mmacosx-version-min=10.14",
+        "-Wl,-rpath,@loader_path/../lib"
+    ],
 )
 
+if platform.machine() == "arm64":
+    os.environ["ARCHFLAGS"] = "-arch arm64"
+
 setup(
     name="ctranslate2",
     version=VERSION,
@@ -117,7 +127,7 @@ def run(self):
         "pyyaml>=5.3,<7",
     ],
     packages=["ctranslate2"],
-    ext_modules=[ctranslate2_module],
+    ext_modules=[ext_module],
     cmdclass={"build_ext": CustomBuildExt},
     entry_points={
         "console_scripts": [