Skip to content

Commit 70af9a3

Browse files
committed
added no gt bm
1 parent fc59e8a commit 70af9a3

File tree

3 files changed

+191
-16
lines changed

3 files changed

+191
-16
lines changed

src/VecSim/algorithms/hnsw/hnsw_disk.h

Lines changed: 59 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -509,7 +509,6 @@ HNSWDiskIndex<DataType, DistType>::~HNSWDiskIndex() {
509509

510510
// Ensure all memory is properly released
511511
idToMetaData.shrink_to_fit();
512-
labelToIdMap.clear();
513512

514513
// Note: db and cf are not owned by this class, so we don't delete them
515514
// Base class destructor will handle indexCalculator and preprocessors
@@ -1808,11 +1807,6 @@ void HNSWDiskIndex<DataType, DistType>::processDeleteBatch() {
18081807
}
18091808
}
18101809

1811-
// Track which neighbors were originally connected (before repair)
1812-
// Use unordered_set instead of vector<bool> for memory efficiency:
1813-
// - neighbor_neighbors typically has M or M0 elements (16-32)
1814-
// - unordered_set uses ~4-8 bytes per element = 64-256 bytes total
1815-
// - vector<bool> would use curElementCount/8 bytes (125KB for 1M vectors)
18161810
vecsim_stl::unordered_set<idType> original_neighbors_set(this->allocator);
18171811
original_neighbors_set.reserve(neighbor_neighbors.size());
18181812
for (idType nn : neighbor_neighbors) {
@@ -1881,18 +1875,66 @@ void HNSWDiskIndex<DataType, DistType>::processDeleteBatch() {
18811875
if (!already_bidirectional) {
18821876
// Stage a neighbor update to add the bidirectional connection
18831877
// This will be handled by adding neighbor_id to new_neighbor_id's list
1884-
// We need to check if new_neighbor_id's list is full and apply heuristic if needed
1878+
// We need to check if new_neighbor_id's list is full and apply heuristic
1879+
// if needed
18851880

1886-
// For simplicity in the disk implementation, we'll add it if there's space
1887-
// or let the opportunistic repair handle it during future operations
18881881
size_t max_neighbors = (level == 0) ? M0 : M;
18891882
if (new_neighbor_neighbors.size() < max_neighbors) {
1890-
// Add the reverse connection
1883+
// Space available - simply add the reverse connection
18911884
new_neighbor_neighbors.push_back(neighbor_id);
18921885
stageDeleteUpdate(new_neighbor_id, level, new_neighbor_neighbors);
1886+
} else {
1887+
// List is full - apply heuristic to decide if we should replace
1888+
// an existing neighbor with the new repair edge.
1889+
// This maintains bidirectionality which is critical for HNSW
1890+
// recall quality (avoids "trap" nodes that are easy to enter
1891+
// but hard to exit during greedy search).
1892+
1893+
// Build candidate list: existing neighbors + the new repair edge
1894+
candidatesList<DistType> reverse_candidates(this->allocator);
1895+
reverse_candidates.reserve(new_neighbor_neighbors.size() + 1);
1896+
1897+
const void* new_neighbor_data = getDataByInternalId(new_neighbor_id);
1898+
1899+
// Add existing neighbors with their distances
1900+
for (idType nn : new_neighbor_neighbors) {
1901+
if (nn < curElementCount && !isMarkedDeleted(nn)) {
1902+
const void* nn_data = getDataByInternalId(nn);
1903+
DistType dist = this->calcDistance(nn_data, new_neighbor_data);
1904+
reverse_candidates.emplace_back(dist, nn);
1905+
}
1906+
}
1907+
1908+
// Add the repair edge (neighbor_id -> new_neighbor_id's reverse)
1909+
DistType repair_dist = this->calcDistance(neighbor_data, new_neighbor_data);
1910+
reverse_candidates.emplace_back(repair_dist, neighbor_id);
1911+
1912+
// Apply heuristic to select the best neighbors
1913+
vecsim_stl::vector<idType> removed_from_reverse(this->allocator);
1914+
getNeighborsByHeuristic2(reverse_candidates, max_neighbors, removed_from_reverse);
1915+
1916+
// Check if the repair edge was selected
1917+
bool repair_edge_selected = false;
1918+
for (const auto& [dist, id] : reverse_candidates) {
1919+
if (id == neighbor_id) {
1920+
repair_edge_selected = true;
1921+
break;
1922+
}
1923+
}
1924+
1925+
if (repair_edge_selected) {
1926+
// The heuristic chose the repair edge - update the neighbor list
1927+
vecsim_stl::vector<idType> updated_reverse_neighbors(this->allocator);
1928+
updated_reverse_neighbors.reserve(reverse_candidates.size());
1929+
for (const auto& [dist, id] : reverse_candidates) {
1930+
updated_reverse_neighbors.push_back(id);
1931+
}
1932+
stageDeleteUpdate(new_neighbor_id, level, updated_reverse_neighbors);
1933+
}
1934+
// If repair edge was not selected by heuristic, we accept the
1935+
// unidirectional edge - the heuristic determined that the existing
1936+
// neighbors are better for search quality
18931937
}
1894-
// If the list is full, we skip adding the reverse edge
1895-
// This creates a unidirectional edge, which is acceptable in HNSW
18961938
}
18971939
}
18981940
}
@@ -2493,6 +2535,11 @@ vecsim_stl::vector<idType> HNSWDiskIndex<DataType, DistType>::markDelete(labelTy
24932535
if (raw_it != rawVectorsInRAM.end()) {
24942536
rawVectorsInRAM.erase(raw_it);
24952537
}
2538+
2539+
auto disk_it = rawVectorsDiskCache.find(internalId);
2540+
if (disk_it != rawVectorsDiskCache.end()) {
2541+
rawVectorsDiskCache.erase(disk_it);
2542+
}
24962543
this->numMarkedDeleted++;
24972544

24982545
// If this is the entrypoint, we need to replace it

tests/benchmark/bm_common.h

Lines changed: 127 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ class BM_VecSimCommon : public BM_VecSimIndex<index_type_t> {
3737
static void TopK_HNSW_DISK(benchmark::State &st);
3838
static void TopK_HNSW_DISK_MarkDeleted(benchmark::State &st);
3939
static void TopK_HNSW_DISK_DeleteLabel(benchmark::State &st);
40+
// Same as DeleteLabel but excludes ground truth vectors from deletion to keep recall stable
41+
static void TopK_HNSW_DISK_DeleteLabel_ProtectGT(benchmark::State &st);
4042
static void TopK_Tiered(benchmark::State &st, unsigned short index_offset = 0);
4143

4244
// Does nothing but returning the index memory.
@@ -327,6 +329,113 @@ void BM_VecSimCommon<index_type_t>::TopK_HNSW_DISK_DeleteLabel(benchmark::State
327329
}
328330
}
329331

332+
// Same as TopK_HNSW_DISK_DeleteLabel but excludes ground truth vectors from deletion.
333+
// This keeps the ground truth stable across different deletion counts for fair recall comparison.
334+
// st.range(0) = ef_runtime
335+
// st.range(1) = k
336+
// st.range(2) = number of vectors to delete
337+
template <typename index_type_t>
338+
void BM_VecSimCommon<index_type_t>::TopK_HNSW_DISK_DeleteLabel_ProtectGT(benchmark::State &st) {
339+
using data_t = typename index_type_t::data_t;
340+
using dist_t = typename index_type_t::dist_t;
341+
342+
size_t iter = 0;
343+
344+
// Build a set of all ground truth vector IDs (to protect from deletion)
345+
std::unordered_set<labelType> gt_labels_set;
346+
for (size_t q = 0; q < N_QUERIES; q++) {
347+
auto gt_results = BM_VecSimIndex<fp32_index_t>::TopKGroundTruth(q, 100);
348+
for (const auto &res : gt_results->results) {
349+
gt_labels_set.insert(res.id);
350+
}
351+
VecSimQueryReply_Free(gt_results);
352+
}
353+
354+
// Reload the index to get a fresh copy without any deleted vectors
355+
std::string folder_path = BM_VecSimGeneral::AttachRootPath(BM_VecSimGeneral::hnsw_index_file);
356+
INDICES[INDEX_HNSW_DISK] = IndexPtr(HNSWDiskFactory::NewIndex(folder_path));
357+
auto hnsw_index = GET_INDEX(INDEX_HNSW_DISK);
358+
auto *disk_index = dynamic_cast<HNSWDiskIndex<data_t, dist_t> *>(hnsw_index);
359+
360+
// Delete vectors using deleteVector, but skip ground truth vectors
361+
std::vector<labelType> deleted_labels;
362+
const size_t num_to_delete = st.range(2);
363+
364+
// Get pseudo-random unique labels, but the same ones for all runs of the benchmark
365+
// Divide N_VECTORS into num_to_delete equal strata and pick one from each
366+
// Skip any labels that are in ground truth
367+
std::mt19937 rng(42); // Fixed seed for determinism
368+
size_t skipped_gt = 0;
369+
for (size_t i = 0; i < num_to_delete && i < N_VECTORS; i++) {
370+
size_t stratum_start = (i * N_VECTORS) / num_to_delete;
371+
size_t stratum_end = ((i + 1) * N_VECTORS) / num_to_delete;
372+
size_t stratum_size = stratum_end - stratum_start;
373+
374+
std::uniform_int_distribution<size_t> dist(0, stratum_size - 1);
375+
labelType label = stratum_start + dist(rng);
376+
377+
// Skip if this label is in ground truth
378+
if (gt_labels_set.find(label) != gt_labels_set.end()) {
379+
skipped_gt++;
380+
continue;
381+
}
382+
deleted_labels.push_back(label);
383+
}
384+
385+
// Measure the time spent on deleteVector calls (includes batch merge every 10 vectors)
386+
auto delete_start = std::chrono::high_resolution_clock::now();
387+
for (const auto &label : deleted_labels) {
388+
disk_index->deleteVector(label);
389+
}
390+
// Force flush any pending deletes to ensure graph is fully repaired
391+
disk_index->flushDeleteBatch();
392+
auto delete_end = std::chrono::high_resolution_clock::now();
393+
double delete_time_ms = std::chrono::duration<double, std::milli>(delete_end - delete_start).count();
394+
395+
size_t total_deleted = deleted_labels.size();
396+
st.counters["num_deleted"] = total_deleted;
397+
st.counters["num_gt_protected"] = skipped_gt;
398+
st.counters["delete_time_ms"] = delete_time_ms;
399+
if (total_deleted > 0) {
400+
st.counters["delete_time_per_vector_ms"] = delete_time_ms / total_deleted;
401+
}
402+
403+
// Get DB statistics before benchmark
404+
auto stats = disk_index->getDBStatistics();
405+
size_t io_bytes_before = 0;
406+
if (stats) {
407+
io_bytes_before = stats->getTickerCount(rocksdb::Tickers::BYTES_COMPRESSED_TO);
408+
}
409+
410+
std::atomic_int correct = 0;
411+
size_t ef = st.range(0);
412+
size_t k = st.range(1);
413+
414+
for (auto _ : st) {
415+
HNSWRuntimeParams hnswRuntimeParams = {.efRuntime = ef};
416+
auto query_params = BM_VecSimGeneral::CreateQueryParams(hnswRuntimeParams);
417+
auto &q = QUERIES[iter % N_QUERIES];
418+
419+
auto hnsw_results = VecSimIndex_TopKQuery(hnsw_index, q.data(), k, &query_params, BY_SCORE);
420+
st.PauseTiming();
421+
422+
// Ground truth is unchanged since we protected all GT vectors from deletion
423+
auto gt_results = BM_VecSimIndex<fp32_index_t>::TopKGroundTruth(iter % N_QUERIES, k);
424+
425+
BM_VecSimGeneral::MeasureRecall(hnsw_results, gt_results, correct);
426+
427+
VecSimQueryReply_Free(hnsw_results);
428+
VecSimQueryReply_Free(gt_results);
429+
st.ResumeTiming();
430+
iter++;
431+
}
432+
st.counters["Recall"] = (float)correct / (float)(k * iter);
433+
if (stats) {
434+
size_t io_bytes_after = stats->getTickerCount(rocksdb::Tickers::BYTES_COMPRESSED_TO);
435+
st.counters["io_bytes_per_query"] = static_cast<double>(io_bytes_after - io_bytes_before) / iter;
436+
}
437+
}
438+
330439
template <typename index_type_t>
331440

332441
void BM_VecSimCommon<index_type_t>::TopK_BF(benchmark::State &st, unsigned short index_offset) {
@@ -438,10 +547,10 @@ void BM_VecSimCommon<index_type_t>::TopK_Tiered(benchmark::State &st, unsigned s
438547
BENCHMARK_REGISTER_F(BM_CLASS, BM_FUNC) \
439548
->Args({10, 10, 1000}) \
440549
->Args({10, 10, 10000}) \
441-
->Args({10, 10, 50000}) \
550+
->Args({10, 10, 25000}) \
442551
->Args({200, 50, 1000}) \
443552
->Args({200, 50, 10000}) \
444-
->Args({200, 50, 50000}) \
553+
->Args({200, 50, 25000}) \
445554
->ArgNames({"ef_runtime", "k", "num_marked_deleted"}) \
446555
->Iterations(10) \
447556
->Unit(benchmark::kMillisecond)
@@ -452,10 +561,24 @@ void BM_VecSimCommon<index_type_t>::TopK_Tiered(benchmark::State &st, unsigned s
452561
BENCHMARK_REGISTER_F(BM_CLASS, BM_FUNC) \
453562
->Args({10, 10, 1000}) \
454563
->Args({10, 10, 10000}) \
455-
->Args({10, 10, 50000}) \
564+
->Args({10, 10, 25000}) \
565+
->Args({200, 50, 1000}) \
566+
->Args({200, 50, 10000}) \
567+
->Args({200, 50, 25000}) \
568+
->ArgNames({"ef_runtime", "k", "num_deleted"}) \
569+
->Iterations(10) \
570+
->Unit(benchmark::kMillisecond)
571+
572+
// {ef_runtime, k, num_deleted}
573+
// Same as DeleteLabel but protects ground truth vectors from deletion for stable recall comparison
574+
#define REGISTER_TopK_HNSW_DISK_DeleteLabel_ProtectGT(BM_CLASS, BM_FUNC) \
575+
BENCHMARK_REGISTER_F(BM_CLASS, BM_FUNC) \
576+
->Args({10, 10, 1000}) \
577+
->Args({10, 10, 10000}) \
578+
->Args({10, 10, 25000}) \
456579
->Args({200, 50, 1000}) \
457580
->Args({200, 50, 10000}) \
458-
->Args({200, 50, 50000}) \
581+
->Args({200, 50, 25000}) \
459582
->ArgNames({"ef_runtime", "k", "num_deleted"}) \
460583
->Iterations(10) \
461584
->Unit(benchmark::kMillisecond)

tests/benchmark/bm_initialization/bm_hnsw_disk_initialize_fp32.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,11 @@ BENCHMARK_TEMPLATE_DEFINE_F(BM_VecSimCommon, BM_FUNC_NAME(TopK_DeleteLabel, HNSW
4949
(benchmark::State &st) { TopK_HNSW_DISK_DeleteLabel(st); }
5050
REGISTER_TopK_HNSW_DISK_DeleteLabel(BM_VecSimCommon, BM_FUNC_NAME(TopK_DeleteLabel, HNSWDisk));
5151

52+
// TopK benchmark after deleting vectors (with graph repair), protecting GT vectors for stable recall
53+
BENCHMARK_TEMPLATE_DEFINE_F(BM_VecSimCommon, BM_FUNC_NAME(TopK_DeleteLabel_ProtectGT, HNSWDisk), fp32_index_t)
54+
(benchmark::State &st) { TopK_HNSW_DISK_DeleteLabel_ProtectGT(st); }
55+
REGISTER_TopK_HNSW_DISK_DeleteLabel_ProtectGT(BM_VecSimCommon, BM_FUNC_NAME(TopK_DeleteLabel_ProtectGT, HNSWDisk));
56+
5257
// Range benchmarks
5358
// BENCHMARK_TEMPLATE_DEFINE_F(BM_VecSimBasics, BM_FUNC_NAME(Range, BF), fp32_index_t)
5459
// (benchmark::State &st) { Range_BF(st); }

0 commit comments

Comments
 (0)