Skip to content

Commit 8c30186

Browse files
author
Jonathan Shook
committed
Virtualize and Modularize DataSetLoader logic
- virtualize DataSetLoader
- separate and modularize HDF5, MFD loaders and consolidate helper methods
- organize dataset types into a package together
- update callers, minor cleanups
- use Optional instead of checked IOExceptions
- add javadoc
- organize dataset classes together
- remove bench2d and loader
1 parent f967f1c commit 8c30186

28 files changed

+469
-618
lines changed

benchmarks-jmh/src/main/java/io/github/jbellis/jvector/bench/IndexConstructionWithStaticSetBenchmark.java

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,9 @@
1515
*/
1616
package io.github.jbellis.jvector.bench;
1717

18-
import io.github.jbellis.jvector.example.SiftSmall;
1918
import io.github.jbellis.jvector.example.util.SiftLoader;
2019
import io.github.jbellis.jvector.graph.*;
2120
import io.github.jbellis.jvector.graph.similarity.BuildScoreProvider;
22-
import io.github.jbellis.jvector.util.Bits;
2321
import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
2422
import io.github.jbellis.jvector.vector.types.VectorFloat;
2523
import org.openjdk.jmh.annotations.*;

benchmarks-jmh/src/main/java/io/github/jbellis/jvector/bench/StaticSetVectorsBenchmark.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
*/
1616
package io.github.jbellis.jvector.bench;
1717

18-
import io.github.jbellis.jvector.example.SiftSmall;
18+
import io.github.jbellis.jvector.example.benchmarks.datasets.SiftSmall;
1919
import io.github.jbellis.jvector.example.util.SiftLoader;
2020
import io.github.jbellis.jvector.graph.*;
2121
import io.github.jbellis.jvector.graph.similarity.BuildScoreProvider;

jvector-examples/pom.xml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@
163163
<argument>-classpath</argument>
164164
<classpath/>
165165
<argument>-ea</argument>
166-
<argument>io.github.jbellis.jvector.example.SiftSmall</argument>
166+
<argument>io.github.jbellis.jvector.example.benchmarks.datasets.SiftSmall</argument>
167167
</arguments>
168168
</configuration>
169169
</execution>
@@ -212,7 +212,7 @@
212212
<classpath/>
213213
<argument>--add-modules=jdk.incubator.vector</argument>
214214
<argument>-ea</argument>
215-
<argument>io.github.jbellis.jvector.example.SiftSmall</argument>
215+
<argument>io.github.jbellis.jvector.example.benchmarks.datasets.SiftSmall</argument>
216216
</arguments>
217217
</configuration>
218218
</execution>
@@ -306,7 +306,7 @@
306306
<argument>--add-modules=jdk.incubator.vector</argument>
307307
<argument>-ea</argument>
308308
<argument>-Djvector.experimental.enable_native_vectorization=true</argument>
309-
<argument>io.github.jbellis.jvector.example.SiftSmall</argument>
309+
<argument>io.github.jbellis.jvector.example.benchmarks.datasets.SiftSmall</argument>
310310
</arguments>
311311
</configuration>
312312
</execution>

jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@
2020
import io.github.jbellis.jvector.example.util.BenchmarkSummarizer;
2121
import io.github.jbellis.jvector.example.util.BenchmarkSummarizer.SummaryStats;
2222
import io.github.jbellis.jvector.example.util.CheckpointManager;
23-
import io.github.jbellis.jvector.example.util.DataSet;
24-
import io.github.jbellis.jvector.example.util.DataSetLoader;
23+
import io.github.jbellis.jvector.example.benchmarks.datasets.DataSet;
24+
import io.github.jbellis.jvector.example.benchmarks.datasets.DataSets;
2525
import io.github.jbellis.jvector.example.yaml.MultiConfig;
2626

2727
import org.slf4j.Logger;
@@ -130,7 +130,9 @@ public static void main(String[] args) throws IOException {
130130

131131
logger.info("Loading dataset: {}", datasetName);
132132
try {
133-
DataSet ds = DataSetLoader.loadDataSet(datasetName);
133+
DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow(
134+
() -> new IllegalStateException("Dataset " + datasetName + " not found")
135+
);
134136
logger.info("Dataset loaded: {} with {} vectors", datasetName, ds.getBaseVectors().size());
135137

136138
String normalizedDatasetName = datasetName;

jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@
1818

1919
import io.github.jbellis.jvector.example.util.CompressorParameters;
2020
import io.github.jbellis.jvector.example.util.CompressorParameters.PQParameters;
21-
import io.github.jbellis.jvector.example.util.DataSet;
22-
import io.github.jbellis.jvector.example.util.DataSetLoader;
21+
import io.github.jbellis.jvector.example.benchmarks.datasets.DataSet;
22+
import io.github.jbellis.jvector.example.benchmarks.datasets.DataSets;
2323
import io.github.jbellis.jvector.example.yaml.DatasetCollection;
2424
import io.github.jbellis.jvector.graph.disk.feature.FeatureId;
2525
import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
@@ -90,7 +90,9 @@ private static void execute(Pattern pattern, List<Function<DataSet, CompressorPa
9090
System.out.println("Executing the following datasets: " + datasetNames);
9191

9292
for (var datasetName : datasetNames) {
93-
DataSet ds = DataSetLoader.loadDataSet(datasetName);
93+
DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow(
94+
() -> new RuntimeException("Dataset " + datasetName + " not found")
95+
);
9496
Grid.runAll(ds, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid, featureSets, buildCompression, compressionGrid, topKGrid, usePruningGrid);
9597
}
9698
}

jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench2D.java

Lines changed: 0 additions & 65 deletions
This file was deleted.

jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@
1616

1717
package io.github.jbellis.jvector.example;
1818

19-
import io.github.jbellis.jvector.example.util.DataSet;
20-
import io.github.jbellis.jvector.example.util.DataSetLoader;
19+
import io.github.jbellis.jvector.example.benchmarks.datasets.DataSet;
20+
import io.github.jbellis.jvector.example.benchmarks.datasets.DataSets;
2121
import io.github.jbellis.jvector.example.yaml.DatasetCollection;
2222
import io.github.jbellis.jvector.example.yaml.MultiConfig;
2323

@@ -52,13 +52,16 @@ public static void main(String[] args) throws IOException {
5252
if (!datasetNames.isEmpty()) {
5353
System.out.println("Executing the following datasets: " + datasetNames);
5454

55-
for (var datasetName : datasetNames) {
56-
DataSet ds = DataSetLoader.loadDataSet(datasetName);
55+
String hdf5 = ".hdf5";
56+
for (var rawname : datasetNames) {
57+
String datasetName =
58+
rawname.endsWith(hdf5) ? rawname.substring(0, rawname.length() - hdf5.length() -1) : rawname;
59+
// pre-loading and early error phase
60+
DataSets.loadDataSet(datasetName).orElseThrow(
61+
() -> new RuntimeException("Could not load dataset:" + datasetName)
62+
);
5763

58-
if (datasetName.endsWith(".hdf5")) {
59-
datasetName = datasetName.substring(0, datasetName.length() - ".hdf5".length());
60-
}
61-
MultiConfig config = MultiConfig.getDefaultConfig(datasetName);
64+
MultiConfig config = MultiConfig.getDefaultConfig(rawname);
6265
allConfigs.add(config);
6366
}
6467
}
@@ -76,7 +79,9 @@ public static void main(String[] args) throws IOException {
7679
for (var config : allConfigs) {
7780
String datasetName = config.dataset;
7881

79-
DataSet ds = DataSetLoader.loadDataSet(datasetName);
82+
DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow(
83+
() -> new RuntimeException("Could not load dataset:" + datasetName)
84+
);
8085

8186
Grid.runAll(ds, config.construction.outDegree, config.construction.efConstruction,
8287
config.construction.neighborOverflow, config.construction.addHierarchy, config.construction.refineFinalGraph,

jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
import io.github.jbellis.jvector.example.benchmarks.ThroughputBenchmark;
2828
import io.github.jbellis.jvector.example.benchmarks.diagnostics.DiagnosticLevel;
2929
import io.github.jbellis.jvector.example.util.CompressorParameters;
30-
import io.github.jbellis.jvector.example.util.DataSet;
30+
import io.github.jbellis.jvector.example.benchmarks.datasets.DataSet;
3131
import io.github.jbellis.jvector.example.util.FilteredForkJoinPool;
3232
import io.github.jbellis.jvector.graph.ImmutableGraphIndex;
3333
import io.github.jbellis.jvector.graph.GraphIndexBuilder;

jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,7 @@
1616

1717
package io.github.jbellis.jvector.example;
1818

19-
import io.github.jbellis.jvector.example.util.DataSet;
20-
import io.github.jbellis.jvector.example.util.DownloadHelper;
19+
import io.github.jbellis.jvector.example.benchmarks.datasets.DataSetLoaderMFD;
2120
import io.github.jbellis.jvector.example.yaml.MultiConfig;
2221

2322
import java.io.IOException;
@@ -28,14 +27,10 @@
2827
public class HelloVectorWorld {
2928
public static void main(String[] args) throws IOException {
3029
System.out.println("Heap space available is " + Runtime.getRuntime().maxMemory());
31-
3230
String datasetName = "ada002-100k";
33-
34-
var mfd = DownloadHelper.maybeDownloadFvecs(datasetName);
35-
DataSet ds = mfd.load();
36-
31+
var ds = new DataSetLoaderMFD().loadDataSet(datasetName)
32+
.orElseThrow(() -> new RuntimeException("dataset " + datasetName + " not found"));
3733
MultiConfig config = MultiConfig.getConfig(datasetName);
38-
3934
Grid.runAll(ds, config.construction.outDegree, config.construction.efConstruction,
4035
config.construction.neighborOverflow, config.construction.addHierarchy, config.construction.refineFinalGraph,
4136
config.construction.getFeatureSets(), config.construction.getCompressorParameters(),

jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSet.java renamed to jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSet.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
* limitations under the License.
1515
*/
1616

17-
package io.github.jbellis.jvector.example.util;
17+
package io.github.jbellis.jvector.example.benchmarks.datasets;
1818

1919
import io.github.jbellis.jvector.graph.RandomAccessVectorValues;
2020
import io.github.jbellis.jvector.vector.VectorSimilarityFunction;

0 commit comments

Comments
 (0)