Fix non-contiguous queries when using Vamana (#343)

jparismorgan · web-flow · commit 23f201ae8b62 · 2024-04-29T17:00:11.000+02:00
diff --git a/apis/python/src/tiledb/vector_search/vamana_index.py b/apis/python/src/tiledb/vector_search/vamana_index.py
@@ -91,8 +91,11 @@ def query_internal(
 
         if queries.ndim == 1:
             queries = np.array([queries])
+        queries = np.transpose(queries)
+        if not queries.flags.f_contiguous:
+            queries = queries.copy(order="F")
+        queries_feature_vector_array = vspy.FeatureVectorArray(queries)
 
-        queries_feature_vector_array = vspy.FeatureVectorArray(np.transpose(queries))
         distances, ids = self.index.query(queries_feature_vector_array, k, opt_l)
 
         return np.array(distances, copy=False), np.array(ids, copy=False)
diff --git a/apis/python/test/test_index.py b/apis/python/test/test_index.py
@@ -353,6 +353,8 @@ def test_index_with_incorrect_num_of_query_columns_simple(tmp_path):
         queries = load_fvecs(queries_uri)
         index.query(queries, k=10)
 
+        Index.delete_index(uri=index_uri, config={})
+
 
 def test_index_with_incorrect_num_of_query_columns_complex(tmp_path):
     vfs = tiledb.VFS()
diff --git a/apis/python/test/test_ingestion.py b/apis/python/test/test_ingestion.py
@@ -265,11 +265,6 @@ def test_ingestion_fvec(tmp_path):
     gt_i, gt_d = get_groundtruth_ivec(gt_uri, k=k, nqueries=nqueries)
 
     for index_type, index_class in zip(INDEXES, INDEX_CLASSES):
-        # TODO(paris): Fix Vamana bug and re-enable:
-        # RuntimeError: IndexError: index 100 is out of bounds for axis 0 with size 100
-        if index_type == "VAMANA":
-            continue
-
         index_uri = os.path.join(tmp_path, f"array_{index_type}")
         index = ingest(
             index_type=index_type,
@@ -319,11 +314,6 @@ def test_ingestion_numpy(tmp_path):
     gt_i, gt_d = get_groundtruth_ivec(gt_uri, k=k, nqueries=nqueries)
 
     for index_type, index_class in zip(INDEXES, INDEX_CLASSES):
-        # TODO(paris): Fix Vamana bug and re-enable:
-        # RuntimeError: IndexError: index 100 is out of bounds for axis 0 with size 100
-        if index_type == "VAMANA":
-            continue
-
         index_uri = os.path.join(tmp_path, f"array_{index_type}")
         index = ingest(
             index_type=index_type,
@@ -424,11 +414,6 @@ def test_ingestion_multiple_workers(tmp_path):
     gt_i, gt_d = get_groundtruth_ivec(gt_uri, k=k, nqueries=nqueries)
 
     for index_type, index_class in zip(INDEXES, INDEX_CLASSES):
-        # TODO(paris): Fix Vamana bug and re-enable:
-        # RuntimeError: Invalid key when getting the URI: adjacency_scores_array_name. Name does not exist: adjacency_scores
-        if index_type == "VAMANA":
-            continue
-
         index_uri = os.path.join(tmp_path, f"array_{index_type}")
         index = ingest(
             index_type=index_type,
@@ -485,11 +470,6 @@ def test_ingestion_external_ids_numpy(tmp_path):
     )
 
     for index_type, index_class in zip(INDEXES, INDEX_CLASSES):
-        # # TODO(paris): Fix Vamana bug and re-enable:
-        # # RuntimeError: Invalid key when getting the URI: adjacency_scores_array_name. Name does not exist: adjacency_scores
-        if index_type == "VAMANA":
-            continue
-
         index_uri = os.path.join(tmp_path, f"array_{index_type}")
         index = ingest(
             index_type=index_type,
@@ -530,11 +510,6 @@ def test_ingestion_with_updates(tmp_path):
     gt_i, gt_d = get_groundtruth(dataset_dir, k)
 
     for index_type, index_class in zip(INDEXES, INDEX_CLASSES):
-        # TODO(paris): Fix Vamana bug and re-enable:
-        # RuntimeError: Invalid key when getting the URI: adjacency_scores_array_name. Name does not exist: adjacency_scores
-        if index_type == "VAMANA":
-            continue
-
         index_uri = os.path.join(tmp_path, f"array_{index_type}")
         index = ingest(
             index_type=index_type,
diff --git a/apis/python/test/test_type_erased_module.py b/apis/python/test/test_type_erased_module.py
@@ -4,6 +4,7 @@
 from array_paths import *
 
 from tiledb.vector_search import _tiledbvspy as vspy
+from tiledb.vector_search.utils import load_fvecs
 
 ctx = vspy.Ctx({})
 
@@ -80,6 +81,8 @@ def test_numpy_to_feature_vector_array():
     assert a.shape == (10000, 128)
     assert b.dimension() == 128
     assert b.num_vectors() == 10000
+    assert a.shape == np.array(b).shape
+    assert np.array_equal(a, np.array(b))
 
     a = np.array(np.random.rand(128, 10000), dtype=np.float32, order="F")
     b = vspy.FeatureVectorArray(a)
@@ -88,6 +91,9 @@ def test_numpy_to_feature_vector_array():
     assert a.shape == (128, 10000)
     assert b.dimension() == 128
     assert b.num_vectors() == 10000
+    # TODO(paris): Round-tripping this F-ordered array through np.array(b) should reproduce its shape and contents, but it currently doesn't.
+    # assert a.shape == np.array(b).shape
+    # assert np.array_equal(a, np.array(b))
 
     a = np.array(np.random.rand(10000, 128), dtype=np.float32)
     b = vspy.FeatureVectorArray(a.T)
@@ -96,6 +102,8 @@ def test_numpy_to_feature_vector_array():
     assert a.shape == (10000, 128)
     assert b.dimension() == 128
     assert b.num_vectors() == 10000
+    assert a.shape == np.array(b).shape
+    assert np.array_equal(a, np.array(b))
 
     a = np.array(np.random.rand(1000000, 128), dtype=np.uint8)
     b = vspy.FeatureVectorArray(a)
@@ -104,17 +112,44 @@ def test_numpy_to_feature_vector_array():
     assert a.shape == (1000000, 128)
     assert b.dimension() == 128
     assert b.num_vectors() == 1000000
+    assert a.shape == np.array(b).shape
+    assert np.array_equal(a, np.array(b))
 
     a = np.array(np.random.rand(10000, 128), dtype=np.float32)
     b = vspy.FeatureVectorArray(a)
     logging.info(a.shape)
     logging.info((b.dimension(), b.num_vectors()))
-
-    c = np.array(b)
-    logging.info(c.shape)
-
-    assert a.shape == c.shape
-    assert (a == c).all()
+    assert a.shape == np.array(b).shape
+    assert np.array_equal(a, np.array(b))
+
+    a = np.array(np.arange(1, 16, dtype=np.float32).reshape(3, 5), dtype=np.float32)
+    assert a.shape == (3, 5)
+    assert a.flags.f_contiguous is False
+    assert a.flags.c_contiguous is True
+    a = np.transpose(a)
+    assert a.shape == (5, 3)
+    assert a.flags.f_contiguous is True
+    assert a.flags.c_contiguous is False
+    b = vspy.FeatureVectorArray(a)
+    # NOTE(paris): It is strange that we have to transpose this output array to have it match the input array. Should investigate this and fix it.
+    assert a.shape == np.transpose(np.array(b)).shape
+    assert np.array_equal(a, np.transpose(np.array(b)))
+
+    n = 99
+    a = load_fvecs(siftsmall_query_file)[0:n]
+    assert a.shape == (n, 128)
+    assert a.flags.f_contiguous is False
+    assert a.flags.c_contiguous is False
+    a = np.transpose(a)
+    assert a.shape == (128, n)
+    assert a.flags.f_contiguous is False
+    assert a.flags.c_contiguous is False
+    # NOTE(paris): load_fvecs() returns a view of an array, which is not contiguous, so make it contiguous. Ideally we would handle this in FeatureVectorArray().
+    a = np.asfortranarray(a)
+    b = vspy.FeatureVectorArray(a)
+    # NOTE(paris): It is strange that we have to transpose this output array to have it match the input array. Should investigate this and fix it.
+    assert a.shape == np.transpose(np.array(b)).shape
+    assert np.array_equal(a, np.transpose(np.array(b)))
 
 
 def test_construct_IndexFlatL2():
diff --git a/src/include/detail/linalg/matrix.h b/src/include/detail/linalg/matrix.h
@@ -508,7 +508,8 @@ void debug_slice(
   auto rowsEnd = std::min(dimension(A), static_cast<size_t>(max_size));
   auto colsEnd = std::min(num_vectors(A), static_cast<size_t>(max_size));
 
-  std::cout << "# " << msg << std::endl;
+  std::cout << "# " << msg << " (" << dimension(A) << " rows x "
+            << num_vectors(A) << " cols)" << std::endl;
   for (size_t i = 0; i < rowsEnd; ++i) {
     std::cout << "# ";
     for (size_t j = 0; j < colsEnd; ++j) {
diff --git a/src/include/detail/linalg/matrix_with_ids.h b/src/include/detail/linalg/matrix_with_ids.h
@@ -178,7 +178,8 @@ void debug_slice_with_ids(
   auto rowsEnd = std::min(dimension(A), static_cast<size_t>(max_size));
   auto colsEnd = std::min(num_vectors(A), static_cast<size_t>(max_size));
 
-  std::cout << "# " << msg << std::endl;
+  std::cout << "# " << msg << " (" << num_vectors(A) << " vectors)"
+            << std::endl;
   for (size_t i = 0; i < rowsEnd; ++i) {
     std::cout << "# ";
     for (size_t j = 0; j < colsEnd; ++j) {