Skip to content

Commit 6e97afb

Browse files
committed
update dataset references
1 parent 75dd9b8 commit 6e97afb

File tree

1 file changed

+38
-3
lines changed

1 file changed

+38
-3
lines changed

jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
import io.github.jbellis.jvector.example.util.BenchmarkSummarizer.SummaryStats;
2222
import io.github.jbellis.jvector.example.util.DataSet;
2323
import io.github.jbellis.jvector.example.util.DataSetLoader;
24-
import io.github.jbellis.jvector.example.yaml.DatasetCollection;
2524
import io.github.jbellis.jvector.example.yaml.MultiConfig;
2625
import io.github.jbellis.jvector.graph.disk.feature.FeatureId;
2726

@@ -39,6 +38,43 @@
3938
* for regression testing in the run-bench.yml workflow.
4039
*/
4140
public class AutoBenchYAML {
41+
/**
42+
* Returns the hard-coded list of dataset names that are currently enabled in this class.
43+
* This replaces the need to load datasets.yml, which may not be available in all environments.
* Entries that are commented out in the method body are not part of the returned list.
*
* @return a newly created {@code ArrayList} holding the enabled dataset names, in the order they are added below
44+
*/
45+
private static List<String> getAllDatasetNames() {
46+
List<String> allDatasets = new ArrayList<>();
47+
48+
// neighborhood-watch-100k datasets (all entries currently commented out, so none are returned)
49+
// allDatasets.add("ada002-100k");
50+
// allDatasets.add("cohere-english-v3-100k");
51+
// allDatasets.add("openai-v3-small-100k");
52+
// allDatasets.add("gecko-100k");
53+
// allDatasets.add("openai-v3-large-3072-100k");
54+
// allDatasets.add("openai-v3-large-1536-100k");
55+
// allDatasets.add("e5-small-v2-100k");
56+
// allDatasets.add("e5-base-v2-100k");
57+
// allDatasets.add("e5-large-v2-100k");
58+
//
59+
// // neighborhood-watch-1M datasets (also commented out)
60+
// allDatasets.add("ada002-1M");
61+
// allDatasets.add("colbert-1M");
62+
63+
// ann-benchmarks datasets (the only entries actually added to the list)
64+
allDatasets.add("glove-25-angular.hdf5");
65+
allDatasets.add("glove-50-angular.hdf5");
66+
allDatasets.add("lastfm-64-dot.hdf5");
67+
allDatasets.add("glove-100-angular.hdf5");
68+
allDatasets.add("glove-200-angular.hdf5");
69+
allDatasets.add("nytimes-256-angular.hdf5");
70+
allDatasets.add("sift-128-euclidean.hdf5");
71+
// Large files not yet supported:
72+
// allDatasets.add("deep-image-96-angular.hdf5");
73+
// allDatasets.add("gist-960-euclidean.hdf5");
74+
75+
return allDatasets;
76+
}
77+
4278
public static void main(String[] args) throws IOException {
4379
// Check for --output argument (required for this class)
4480
String outputPath = null;
@@ -64,8 +100,7 @@ public static void main(String[] args) throws IOException {
64100
// compile regex and do substring matching using find
65101
var pattern = Pattern.compile(regex);
66102

67-
var datasetCollection = DatasetCollection.load();
68-
var datasetNames = datasetCollection.getAll().stream().filter(dn -> pattern.matcher(dn).find()).collect(Collectors.toList());
103+
var datasetNames = getAllDatasetNames().stream().filter(dn -> pattern.matcher(dn).find()).collect(Collectors.toList());
69104

70105
System.out.println("Executing the following datasets: " + datasetNames);
71106
List<BenchResult> results = new ArrayList<>();

0 commit comments

Comments
 (0)