
Commit 52c95c5

Added image benchmarks

1 parent 356d577

8 files changed: +360 −15 lines changed

Makefile

Lines changed: 8 additions & 0 deletions

@@ -16,3 +16,11 @@ fix:
 
 test:
 	uv run pytest --cov=semhash --cov-report=term-missing
+
+benchmark-text:
+	uv run python -m benchmarks.run_text_benchmarks
+
+benchmark-image:
+	uv run python -m benchmarks.run_image_benchmarks
+
+benchmark: benchmark-text benchmark-image

README.md

Lines changed: 33 additions & 6 deletions

@@ -303,14 +303,28 @@ from semhash import SemHash
 class VisionEncoder:
     """Custom encoder using timm models. Implements the Encoder protocol."""
 
-    def __init__(self, model_name: str = "mobilenetv3_small_100"):
+    def __init__(self, model_name: str = "mobilenetv3_small_100.lamb_in1k"):
         self.model = timm.create_model(model_name, pretrained=True, num_classes=0).eval()
-        self.transform = timm.data.create_transform(**timm.data.resolve_model_data_config(self.model))
+        data_config = timm.data.resolve_model_data_config(self.model)
+        self.transform = timm.data.create_transform(**data_config, is_training=False)
 
-    def encode(self, inputs):
+    def encode(self, inputs, batch_size: int = 128):
         """Encode a batch of PIL images into embeddings."""
+        import numpy as np
+
+        # Convert grayscale to RGB if needed
+        rgb_inputs = [img.convert("RGB") if img.mode != "RGB" else img for img in inputs]
+
+        # Process in batches to avoid memory issues
+        all_embeddings = []
         with torch.no_grad():
-            return self.model(torch.stack([self.transform(img) for img in inputs])).numpy()
+            for i in range(0, len(rgb_inputs), batch_size):
+                batch_inputs = rgb_inputs[i : i + batch_size]
+                batch = torch.stack([self.transform(img) for img in batch_inputs])
+                embeddings = self.model(batch).numpy()
+                all_embeddings.append(embeddings)
+
+        return np.vstack(all_embeddings)
 
 # Load image dataset
 dataset = load_dataset("uoft-cs/cifar10", split="test")
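This hunk ends at loading CIFAR-10; the README code that actually runs the deduplication sits between this hunk and the next and is not shown in the diff. Below is a rough sketch of how the encoder above could be wired into SemHash. Passing PIL images directly and the `model` keyword are assumptions; only `self_deduplicate().selected` is visible in the next hunk's context line.

```python
# Illustrative sketch, not part of this commit. Assumes the VisionEncoder class
# from the hunk above is in scope and that SemHash.from_records accepts a custom
# encoder via a `model` argument, mirroring the text examples in the README.
from datasets import load_dataset
from semhash import SemHash

encoder = VisionEncoder()
dataset = load_dataset("uoft-cs/cifar10", split="test")
images = [row["img"] for row in dataset]  # "img" is the CIFAR-10 image column (see benchmarks/data.py)

# Deduplicate the images using the custom vision encoder.
semhash = SemHash.from_records(records=images, model=encoder)
deduplicated_records = semhash.self_deduplicate().selected
```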
@@ -513,9 +527,22 @@ deduplicated_records = semhash.self_deduplicate().selected
 
 ## Benchmarks
 
-SemHash is extremely fast and scales to large datasets with millions of records. We've benchmarked both single-dataset deduplication and train/test deduplication across a variety of datasets. For example, deduplicating 1.8M records takes only ~83 seconds on CPU.
+SemHash is extremely fast and scales to large datasets with millions of records. We've benchmarked both text and image deduplication across a variety of datasets. For example, deduplicating 1.8M text records takes only ~83 seconds on CPU.
+
+For detailed benchmark results and analysis, see the [benchmarks directory](benchmarks/README.md).
+
+### Running Benchmarks
 
-For detailed benchmark results including performance metrics across 17 datasets, as well as code to reproduce the benchmarks, see the [benchmarks directory](benchmarks/README.md).
+```bash
+# Run text benchmarks
+make benchmark-text
+
+# Run image benchmarks
+make benchmark-image
+
+# Run all benchmarks
+make benchmark
+```
 
 ## License
 
benchmarks/README.md

Lines changed: 77 additions & 9 deletions

@@ -1,16 +1,19 @@
 # SemHash Benchmarks
 
-This directory contains the benchmarking code and results for SemHash. The benchmarks measure deduplication performance and speed across a variety of datasets.
+This directory contains the benchmarking code and results for SemHash. The benchmarks measure deduplication performance and speed across a variety of text and image datasets.
 
-## Setup
+## Text Benchmarks
 
-All benchmarks were run with the following configuration:
+### Setup
+
+All text benchmarks were run with the following configuration:
 - **CPU-only**: All benchmarks run on CPU (no GPU acceleration)
 - **ANN backend**: Default backend (USearch)
 - **Encoder**: Default encoder ([potion-base-8M](https://huggingface.co/minishlab/potion-base-8M))
 - **Timing**: Includes encoding time, index building time, and deduplication time
+- **Dependencies**: Requires `datasets` package (`pip install datasets`)
 
-## Results
+### Results
 
 ### Train Deduplication Benchmark
 
@@ -60,7 +63,7 @@ This benchmark measures the performance of deduplicating a test dataset against
 | squad_v2 | 130319 | 11873 | 11863 | 0.08 | 7.13 |
 | wikitext | 1801350 | 4358 | 2139 | 50.92 | 40.32 |
 
-## Key Findings
+### Key Findings
 
 SemHash is extremely fast and scales to large datasets with millions of records. Some notable findings include:
 
@@ -70,12 +73,77 @@ SemHash is extremely fast and scales to large datasets with millions of records.
 - `student`: 52% of test data overlaps with training data
 - `wikitext`: 51% of test data overlaps with training data
 
-## Running the Benchmarks
+### Running Text Benchmarks
+
+To run the text benchmarks yourself:
+
+```bash
+# Install dependencies
+pip install datasets
+
+# Run benchmarks
+python -m benchmarks.run_text_benchmarks
+# Or using make
+make benchmark-text
+```
+
+## Image Benchmarks
+
+### Setup
+
+All image benchmarks were run with the following configuration:
+- **Device**: Apple Silicon GPU (MPS)
+- **ANN backend**: Default backend (USearch)
+- **Encoder**: MobileNetV3-Small ([mobilenetv3_small_100.lamb_in1k](https://huggingface.co/timm/mobilenetv3_small_100.lamb_in1k))
+- **Batch size**: 128 images per batch
+- **Timing**: Includes encoding time, index building time, and deduplication time
+
+### Results
+
+#### Train Deduplication Benchmark
+
+This benchmark measures the performance of deduplicating within a single training dataset.
+
+| Dataset       | Original Train Size | Deduplicated Train Size | % Removed | Deduplication Time (s) |
+|---------------|---------------------|-------------------------|-----------|------------------------|
+| cifar10       | 50000               | 48274                   | 3.45      | 61.20                  |
+| fashion_mnist | 60000               | 16714                   | 72.14     | 86.61                  |
+
+#### Train/Test Deduplication Benchmark
+
+This benchmark measures the performance of deduplicating a test dataset against a training dataset.
+
+| Dataset       | Train Size | Test Size | Deduplicated Test Size | % Removed | Deduplication Time (s) |
+|---------------|------------|-----------|------------------------|-----------|------------------------|
+| cifar10       | 50000      | 10000     | 9397                   | 6.03      | 67.43                  |
+| fashion_mnist | 60000      | 10000     | 2052                   | 79.48     | 72.14                  |
+
+### Key Findings
 
-To run the benchmarks yourself:
+- **Fashion-MNIST high deduplication**: Fashion-MNIST shows very high duplication rates (72% train, 79% test) due to the simple nature of the dataset (10 clothing categories with similar items)
+- **CIFAR-10 moderate deduplication**: CIFAR-10 shows lower duplication (3.45% train, 6.03% test) as it contains more diverse natural images
+- **Speed**: Image deduplication is fast even for large datasets (60k images in ~87 seconds on MPS)
+
+### Running Image Benchmarks
+
+To run the image benchmarks yourself:
 
 ```bash
-python -m benchmarks.run_benchmarks
+# Install dependencies
+pip install timm torch datasets
+
+# Run benchmarks
+python -m benchmarks.run_image_benchmarks
+# Or using make
+make benchmark-image
 ```
 
-The datasets can be customized by editing `benchmarks/data.py`.
+The image datasets can be customized by editing `benchmarks/data.py` (see `IMAGE_DATASET_DICT`).
+
+## Running All Benchmarks
+
+To run both text and image benchmarks:
+
+```bash
+make benchmark
+```
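The image-benchmark setup above reports running on an Apple Silicon GPU (MPS) with a batch size of 128, but no device handling appears anywhere in this diff. The following is a hedged sketch of one way the timm encoder could be placed on MPS; the actual run_image_benchmarks implementation is not shown here, so treat the device logic as an assumption.

```python
# Hedged sketch, not from this commit: select MPS when available and keep the
# batched encoding pattern from the README encoder, moving tensors to the device.
import numpy as np
import timm
import torch

device = "mps" if torch.backends.mps.is_available() else "cpu"

model = timm.create_model("mobilenetv3_small_100.lamb_in1k", pretrained=True, num_classes=0).eval().to(device)
data_config = timm.data.resolve_model_data_config(model)
transform = timm.data.create_transform(**data_config, is_training=False)


def encode(images, batch_size: int = 128) -> np.ndarray:
    """Encode PIL images in batches; embeddings come back to CPU as a numpy array."""
    rgb = [img.convert("RGB") if img.mode != "RGB" else img for img in images]
    chunks = []
    with torch.no_grad():
        for i in range(0, len(rgb), batch_size):
            batch = torch.stack([transform(img) for img in rgb[i : i + batch_size]]).to(device)
            chunks.append(model(batch).cpu().numpy())
    return np.vstack(chunks)
```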

benchmarks/data.py

Lines changed: 6 additions & 0 deletions

@@ -12,6 +12,7 @@ class DatasetRecord:
     columns: list[str] | None = None
     split_one: str = "train"
     split_two: str = "test"
+    modality: str = "text"
 
 
 DATASET_DICT: dict[str, DatasetRecord] = {
@@ -41,3 +42,8 @@ class DatasetRecord:
         name="Salesforce/wikitext", text_name="text", label_name="text", sub_directory="wikitext-103-raw-v1"
     ),
 }
+
+IMAGE_DATASET_DICT: dict[str, DatasetRecord] = {
+    "cifar10": DatasetRecord(name="uoft-cs/cifar10", columns=["img"], modality="image"),
+    "fashion_mnist": DatasetRecord(name="fashion_mnist", columns=["image"], modality="image"),
+}
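For context on how the new `IMAGE_DATASET_DICT` entries might be consumed, here is a hypothetical sketch of a benchmark loop. The real `benchmarks/run_image_benchmarks.py` is not among the lines shown in this commit, so the function name and the SemHash call are illustrative assumptions; the result keys simply mirror the JSON output below.

```python
# Hypothetical runner sketch, not taken from this commit. `run_train_benchmark`
# is an illustrative name; only IMAGE_DATASET_DICT and DatasetRecord come from
# benchmarks/data.py above.
import time

from datasets import load_dataset
from semhash import SemHash

from benchmarks.data import IMAGE_DATASET_DICT


def run_train_benchmark(encoder) -> list[dict]:
    """Time self-deduplication for each image dataset record."""
    results = []
    for key, record in IMAGE_DATASET_DICT.items():
        train = load_dataset(record.name, split=record.split_one)
        images = [row[record.columns[0]] for row in train]

        start = time.perf_counter()
        semhash = SemHash.from_records(records=images, model=encoder)  # assumed API, see README example
        deduplicated = semhash.self_deduplicate().selected
        elapsed = time.perf_counter() - start

        results.append(
            {
                "dataset": key,
                "original_train_size": len(images),
                "deduplicated_train_size": len(deduplicated),
                "percent_removed": 100 * (1 - len(deduplicated) / len(images)),
                "time_seconds": elapsed,
            }
        )
    return results
```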
Lines changed: 20 additions & 0 deletions

@@ -0,0 +1,20 @@
+[
+  {
+    "dataset": "cifar10",
+    "original_train_size": 50000,
+    "deduplicated_train_size": 48274,
+    "percent_removed": 3.4519999999999995,
+    "build_time_seconds": 56.00128899999254,
+    "deduplication_time_seconds": 5.201297917010379,
+    "time_seconds": 61.20258691700292
+  },
+  {
+    "dataset": "fashion_mnist",
+    "original_train_size": 60000,
+    "deduplicated_train_size": 16714,
+    "percent_removed": 72.14333333333333,
+    "build_time_seconds": 61.14413262500602,
+    "deduplication_time_seconds": 25.46288070900482,
+    "time_seconds": 86.60701333401084
+  }
+]
Lines changed: 22 additions & 0 deletions

@@ -0,0 +1,22 @@
+[
+  {
+    "dataset": "cifar10",
+    "train_size": 50000,
+    "test_size": 10000,
+    "deduplicated_test_size": 9397,
+    "percent_removed": 6.030000000000002,
+    "build_time_seconds": 56.00128899999254,
+    "deduplication_time_seconds": 11.428115875009098,
+    "time_seconds": 67.42940487500164
+  },
+  {
+    "dataset": "fashion_mnist",
+    "train_size": 60000,
+    "test_size": 10000,
+    "deduplicated_test_size": 2052,
+    "percent_removed": 79.47999999999999,
+    "build_time_seconds": 61.14413262500602,
+    "deduplication_time_seconds": 10.998616750002839,
+    "time_seconds": 72.14274937500886
+  }
+]
