Skip to content

Commit aceee8d

Browse files
authored
Additional checking of local file I/O (#237)
A test in `unit_vamana_index` was failing in CI but not locally because of an uninitialized `Vector` in `medoid()`. In addition to properly initializing the `Vector`, this PR adds further unit tests to `unit_vamana_index`, which were used to diagnose the problem.
1 parent d1cf7f6 commit aceee8d

File tree

3 files changed

+55
-4
lines changed

3 files changed

+55
-4
lines changed

src/include/detail/graph/diskann.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ auto read_diskann_data(const std::string& path) {
4545
uint32_t ndim{0};
4646

4747
std::ifstream binary_file(path, std::ios::binary);
48+
binary_file.exceptions(std::ifstream::failbit);
4849
if (!binary_file.is_open()) {
4950
throw std::runtime_error("Could not open file " + path);
5051
}
@@ -55,6 +56,10 @@ auto read_diskann_data(const std::string& path) {
5556
auto x = ColMajorMatrix<float>(ndim, npoints);
5657

5758
binary_file.read((char*)x.data(), npoints * ndim * sizeof(float));
59+
if ((size_t)binary_file.gcount() != (size_t)npoints * ndim * sizeof(float)) {
60+
throw std::runtime_error("Could not read all data from " + path);
61+
}
62+
5863
binary_file.close();
5964

6065
return x;

src/include/index/vamana_index.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -392,6 +392,8 @@ template <class Distance = sum_of_squares_distance>
392392
auto medoid(auto&& P, Distance distance = Distance{}) {
393393
auto n = num_vectors(P);
394394
auto centroid = Vector<float>(P[0].size());
395+
std::fill(begin(centroid), end(centroid), 0.0);
396+
395397
for (size_t j = 0; j < n; ++j) {
396398
auto p = P[j];
397399
for (size_t i = 0; i < p.size(); ++i) {

src/include/test/unit_vamana_index.cc

Lines changed: 48 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -128,18 +128,62 @@ TEST_CASE("vamana: diskann", "[vamana]") {
128128
auto f = read_diskann_data(diskann_test_data_file);
129129
CHECK(num_vectors(f) == 256);
130130
CHECK(dimension(f) == 128);
131+
CHECK(f.data() != nullptr);
132+
CHECK(!std::equal(
133+
f.data(), f.data() + 256 * 128, std::vector<float>(128 * 256, 0).data()));
134+
135+
CHECK(f.num_rows() == 128);
136+
CHECK(f.num_cols() == 256);
137+
138+
CHECK(sum_of_squares(f[0], f[72]) == 125678);
139+
{
140+
auto n = num_vectors(f);
141+
CHECK(n != 0);
142+
CHECK(n == 256);
143+
CHECK(f[0].size() == 128);
144+
CHECK(dimension(f) == 128);
145+
146+
auto centroid = Vector<float>(f[0].size());
147+
std::fill(begin(centroid), end(centroid), 0.0);
148+
for (size_t j = 0; j < n; ++j) {
149+
auto p = f[j];
150+
for (size_t i = 0; i < p.size(); ++i) {
151+
centroid[i] += p[i];
152+
}
153+
}
154+
float sum = 0.0;
155+
for (size_t i = 0; i < centroid.size(); ++i) {
156+
sum += abs(centroid[i]);
157+
centroid[i] /= (float)num_vectors(f);
158+
}
159+
CHECK(sum > 0);
160+
161+
std::vector<float> tmp{begin(centroid), end(centroid)};
162+
auto min_score = std::numeric_limits<float>::max();
163+
auto med = std::numeric_limits<size_t>::max();
164+
for (size_t i = 0; i < n; ++i) {
165+
auto score = sum_of_squares(f[i], centroid);
166+
if (score < min_score) {
167+
min_score = score;
168+
med = i;
169+
}
170+
}
171+
CHECK(med != std::numeric_limits<size_t>::max());
172+
}
173+
131174
auto med = ::medoid(f);
132175

133176
if (debug) {
134177
std::cout << "med " << med << std::endl;
178+
std::cout << "f[0] - f[72] = " << sum_of_squares(f[0], f[72]) << std::endl;
135179
}
136180

137181
CHECK(med == 72);
138182

139-
if (debug) {
140-
tiledb::Context ctx;
141-
write_matrix(ctx, f, "/tmp/diskann_test_data_file.tdb");
142-
}
183+
// if (debug) {
184+
// tiledb::Context ctx;
185+
// write_matrix(ctx, f, "/tmp/diskann_test_data_file.tdb");
186+
// }
143187
}
144188

145189
TEST_CASE("vamana: small256 build index", "[vamana]") {

0 commit comments

Comments
 (0)