Commit f97bfba (1 parent: a0dbecc)

refactor(benchmarks): improve get_many and similarity_search reporting

* Split get_many into chunks to handle SQLite variable limits
* Update similarity_search to report total time and count
* Separate similarity_search results in print_results for clarity

File tree

2 files changed: +51, -26 lines

benchmarks/operations.py

Lines changed: 12 additions & 3 deletions
@@ -28,7 +28,14 @@ def benchmark_add(
 
 def benchmark_get_many(client: SQLiteVecClient, rowids: list[int]) -> dict:
     """Benchmark get_many operations."""
-    elapsed, _ = benchmark_operation(client.get_many, rowids)
+    # Split into chunks to avoid SQLite variable limit (999)
+    chunk_size = 500
+    import time
+
+    start = time.perf_counter()
+    for i in range(0, len(rowids), chunk_size):
+        client.get_many(rowids[i : i + chunk_size])
+    elapsed = time.perf_counter() - start
     return {
         "operation": "get_many",
         "count": len(rowids),
@@ -49,14 +56,16 @@ def benchmark_similarity_search(
         times.append(elapsed)
 
     avg_time = statistics.mean(times)
+    total_time = sum(times)
     return {
         "operation": "similarity_search",
         "top_k": top_k,
-        "iterations": iterations,
+        "count": iterations,
+        "time": total_time,
+        "ops_per_sec": iterations / total_time,
         "avg_time": avg_time,
         "min_time": min(times),
         "max_time": max(times),
-        "searches_per_sec": 1 / avg_time,
     }
 

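Note: the rename from searches_per_sec to ops_per_sec does not change the number being reported, only the key. Since total_time = sum(times) and avg_time = statistics.mean(times) = sum(times) / iterations, the expression iterations / total_time is algebraically identical to the old 1 / avg_time. A quick sanity check:

import math
import statistics

times = [0.012, 0.009, 0.011, 0.010]  # made-up per-query timings
iterations = len(times)

old_rate = 1 / statistics.mean(times)  # previous "searches_per_sec"
new_rate = iterations / sum(times)     # new "ops_per_sec"
assert math.isclose(old_rate, new_rate)
print(f"{new_rate:.2f} searches/sec")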
benchmarks/reporter.py

Lines changed: 39 additions & 23 deletions
@@ -9,27 +9,49 @@
 
 def print_results(results: list[dict], table_format: str):
     """Print benchmark results in a formatted table."""
-    table_data = []
-    for result in results:
-        op = result["operation"]
-        if "top_k" in result:
-            op = f"{op} (k={result['top_k']})"
+    # Separate similarity_search results
+    regular_results = [r for r in results if r["operation"] != "similarity_search"]
+    search_results = [r for r in results if r["operation"] == "similarity_search"]
 
-        count = result.get("count", result.get("iterations", "-"))
-        time_val = result.get("time", result.get("avg_time", 0))
-        ops_per_sec = result.get("ops_per_sec", result.get("searches_per_sec", 0))
+    # Get count from first result for header
+    count = regular_results[0].get("count", 0) if regular_results else 0
 
-        table_data.append([op, count, f"{time_val:.4f}", f"{ops_per_sec:.2f}"])
+    # Print CRUD operations table
+    table_data = []
+    for result in regular_results:
+        op = result["operation"]
+        time_val = result.get("time", 0)
+        ops_per_sec = result.get("ops_per_sec", 0)
+        table_data.append([op, f"{time_val:.4f}", f"{ops_per_sec:.2f}"])
 
+    print(f"\nCRUD Operations (n={count:,}):")
     print(
-        "\n"
-        + tabulate(
+        tabulate(
             table_data,
-            headers=["Operation", "Count", "Time (s)", "Ops/sec"],
+            headers=["Operation", "Time (s)", "Ops/sec"],
             tablefmt=table_format,
         )
     )
 
+    # Print similarity search table separately
+    if search_results:
+        iterations = search_results[0].get("count", 0)
+        search_data = []
+        for result in search_results:
+            top_k = result.get("top_k", "-")
+            time_val = result.get("time", 0)
+            ops_per_sec = result.get("ops_per_sec", 0)
+            search_data.append([top_k, f"{time_val:.4f}", f"{ops_per_sec:.2f}"])
+
+        print(f"\nSimilarity Search (iterations={iterations}):")
+        print(
+            tabulate(
+                search_data,
+                headers=["Top-K", "Time (s)", "Searches/sec"],
+                tablefmt=table_format,
+            )
+        )
+
 
 def print_summary(
     all_results: dict[str, dict[int, list[dict]]],
@@ -45,20 +67,18 @@ def print_summary(
     operations = [
         "add",
         "get_many",
-        "similarity_search",
         "update_many",
         "get_all",
         "delete_many",
+        "similarity_search",
     ]
     summary_data = []
     for op in operations:
         row = [op]
         for size in dataset_sizes:
             matching = [r for r in mode_results[size] if r["operation"] == op]
             if matching:
-                ops_per_sec = matching[0].get(
-                    "ops_per_sec", matching[0].get("searches_per_sec", 0)
-                )
+                ops_per_sec = matching[0].get("ops_per_sec", 0)
                 row.append(f"{ops_per_sec:,.0f}")
             else:
                 row.append("N/A")
@@ -91,20 +111,16 @@ def export_to_csv(
         operations = [
             "add",
             "get_many",
-            "similarity_search",
             "update_many",
             "get_all",
             "delete_many",
+            "similarity_search",
         ]
         for op in operations:
             matching = [r for r in mode_results[size] if r["operation"] == op]
             if matching:
-                ops_per_sec = matching[0].get(
-                    "ops_per_sec", matching[0].get("searches_per_sec", 0)
-                )
-                time_val = matching[0].get(
-                    "time", matching[0].get("avg_time", 0)
-                )
+                ops_per_sec = matching[0].get("ops_per_sec", 0)
+                time_val = matching[0].get("time", 0)
                 writer.writerow([op, f"{ops_per_sec:.2f}", f"{time_val:.4f}"])
             else:
                 writer.writerow([op, "N/A", "N/A"])
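Note: a hypothetical driver showing how the reworked reporter consumes the unified result keys (count / time / ops_per_sec); the sample numbers are made up, and the import path simply follows the file layout above (it assumes benchmarks is importable and tabulate is installed).

from benchmarks.reporter import print_results

results = [
    {"operation": "add", "count": 10_000, "time": 1.2345,
     "ops_per_sec": 8100.45},
    {"operation": "get_many", "count": 10_000, "time": 0.4321,
     "ops_per_sec": 23142.79},
    {"operation": "similarity_search", "top_k": 10, "count": 100,
     "time": 0.9876, "ops_per_sec": 101.26, "avg_time": 0.009876,
     "min_time": 0.0081, "max_time": 0.0132},
]

# Prints a "CRUD Operations (n=10,000)" table, then a separate
# "Similarity Search (iterations=100)" table.
print_results(results, table_format="github")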
