Skip to content

Commit b00d9b6

Browse files
committed
vecbench: download dataset as separate files
Update DatasetLoader (used by vecbench and the vecann workload) to download the train, test, and neighbors vector sets as separate files rather than as a single file. Keeping them separate will allow huge train datasets to be split across multiple files. The new files use the .fbin and .ibin formats:

.fbin: [num_vectors (uint32), vector_dim (uint32), vector_array (float32)]
.ibin: [num_vectors (uint32), num_neighbors_per_vector (uint32), neighbor_array (int32)]

Epic: CRDB-42943
Release note: None
1 parent 8f28ff9 commit b00d9b6

File tree

8 files changed

+188
-254
lines changed

8 files changed

+188
-254
lines changed

pkg/cmd/vecbench/main.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,8 @@ var flagDBConnStr = flag.String("db", "postgresql://root@localhost:26257",
9494
// dbpedia-openai-100k-angular (100K vectors, 1536 dims)
9595
// dbpedia-openai-1000k-angular (1M vectors, 1536 dims)
9696
// laion-1m-test-ip (1M vectors, 768 dims)
97+
// coco-t2i-512-angular (113K vectors, 512 dims)
98+
// coco-i2i-512-angular (113K vectors, 512 dims)
9799
//
98100
// After download, the datasets are cached in a local temp directory and a
99101
// vector index is created. The built vector index is also cached in the temp
@@ -442,6 +444,9 @@ func (vb *vectorBench) ensureDataset(ctx context.Context, forSearch bool) {
442444
fmt.Printf(Cyan+"\rDownloaded %s / %s (%.2f%%) in %v "+Reset,
443445
humanizeutil.IBytes(downloaded), humanizeutil.IBytes(total),
444446
(float64(downloaded)/float64(total))*100, elapsed.Truncate(time.Second))
447+
if downloaded >= total {
448+
fmt.Println()
449+
}
445450
},
446451
}
447452

pkg/workload/vecann/BUILD.bazel

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ go_library(
1818
"//pkg/workload/histogram",
1919
"//pkg/workload/workloadimpl",
2020
"@com_github_cockroachdb_errors//:errors",
21-
"@com_github_cockroachdb_errors//oserror",
2221
"@com_github_spf13_pflag//:pflag",
2322
"@com_google_cloud_go_storage//:storage",
2423
],

0 commit comments

Comments
 (0)