Validate the queries vector shape (#164)

jparismorgan · web-flow · commit 4cd316fe53c4 · 2023-12-12T17:31:41.000+01:00
diff --git a/apis/python/src/tiledb/vector_search/flat_index.py b/apis/python/src/tiledb/vector_search/flat_index.py
@@ -40,6 +40,7 @@ def __init__(
             + self.index_version
         ].uri
         schema = tiledb.ArraySchema.load(self.db_uri, ctx=tiledb.Ctx(self.config))
+        self.dimensions = schema.shape[0]
         if self.base_size == -1:
             self.size = schema.domain.dim(1).domain[1] + 1
         else:
@@ -74,6 +75,9 @@ def __init__(
                 self._ids = read_vector_u64(
                     self.ctx, self.ids_uri, 0, self.size, self.base_array_timestamp
                 )
+    
+    def get_dimensions(self):
+        return self.dimensions
 
     def query_internal(
         self,
diff --git a/apis/python/src/tiledb/vector_search/index.py b/apis/python/src/tiledb/vector_search/index.py
@@ -126,6 +126,13 @@ def __init__(
         self.thread_executor = futures.ThreadPoolExecutor()
 
     def query(self, queries: np.ndarray, k, **kwargs):
+        if queries.ndim != 1 and queries.ndim != 2:
+            raise TypeError(f"Expected queries to have either 1 or 2 dimensions (i.e. [...] or [[...], [...]]), but it had {queries.ndim} dimensions")
+        
+        query_dimensions = queries.shape[0] if queries.ndim == 1 else queries.shape[1]
+        if query_dimensions != self.get_dimensions():
+            raise TypeError(f"A query in queries has {query_dimensions} dimensions, but the indexed data had {self.dimensions} dimensions")
+
         with tiledb.scope_ctx(ctx_or_config=self.config):
             if not tiledb.array_exists(self.updates_array_uri):
                 if self.query_base_array:
@@ -253,6 +260,9 @@ def read_additions(
             else:
                 return None, None, updated_ids
 
+    def get_dimensions(self):
+        raise NotImplementedError
+
     def query_internal(self, queries: np.ndarray, k, **kwargs):
         raise NotImplementedError
 
diff --git a/apis/python/src/tiledb/vector_search/ivf_flat_index.py b/apis/python/src/tiledb/vector_search/ivf_flat_index.py
@@ -63,9 +63,11 @@ def __init__(
         ].uri
         self.memory_budget = memory_budget
 
+        schema = tiledb.ArraySchema.load(self.db_uri, ctx=tiledb.Ctx(self.config))
+        self.dimensions = schema.shape[0]
+
         self.dtype = self.group.meta.get("dtype", None)
         if self.dtype is None:
-            schema = tiledb.ArraySchema.load(self.db_uri, ctx=tiledb.Ctx(self.config))
             self.dtype = np.dtype(schema.attr("values").dtype)
         else:
             self.dtype = np.dtype(self.dtype)
@@ -120,6 +122,9 @@ def __init__(
                 self.ctx, self.ids_uri, 0, self.size, self.base_array_timestamp
             )
 
+    def get_dimensions(self):
+        return self.dimensions
+
     def query_internal(
         self,
         queries: np.ndarray,
diff --git a/apis/python/test/common.py b/apis/python/test/common.py
@@ -70,8 +70,47 @@ def groundtruth_read(dataset_dir, nqueries=None):
     else:
         return I, D
 
+def create_random_dataset_f32_only_data(nb, d, centers, path):
+    """
+    Creates a random float32 dataset containing only the data vectors (no queries) and then writes it to disk.
+
+    Parameters
+    ----------
+    nb: int
+        Number of points in the dataset
+    d: int
+        Dimension of the dataset
+    centers: int
+        Number of centers
+    path: str
+        Path to write the dataset to
+    """
+    from sklearn.datasets import make_blobs
+
+    os.mkdir(path)
+    X, _ = make_blobs(n_samples=nb, n_features=d, centers=centers, random_state=1)
+
+    with open(os.path.join(path, "data.f32bin"), "wb") as f:
+        np.array([nb, d], dtype="uint32").tofile(f)
+        X.astype("float32").tofile(f)
 
 def create_random_dataset_f32(nb, d, nq, k, path):
+    """
+    Creates a random float32 dataset containing both a dataset and queries against it, and then writes those to disk.
+
+    Parameters
+    ----------
+    nb: int
+        Number of points in the dataset
+    d: int
+        Dimension of the dataset
+    nq: int
+        Number of queries
+    k: int
+        Number of nearest neighbors to return
+    path: str
+        Path to write the dataset to
+    """
     import sklearn.model_selection
     from sklearn.datasets import make_blobs
     from sklearn.neighbors import NearestNeighbors
@@ -104,6 +143,22 @@ def create_random_dataset_f32(nb, d, nq, k, path):
 
 
 def create_random_dataset_u8(nb, d, nq, k, path):
+    """
+    Creates a random uint8 dataset containing both a dataset and queries against it, and then writes those to disk.
+
+    Parameters
+    ----------
+    nb: int
+        Number of points in the dataset
+    d: int
+        Dimension of the dataset
+    nq: int
+        Number of queries
+    k: int
+        Number of nearest neighbors to return
+    path: str
+        Path to write the dataset to
+    """
     import sklearn.model_selection
     from sklearn.datasets import make_blobs
     from sklearn.neighbors import NearestNeighbors
diff --git a/apis/python/test/test_index.py b/apis/python/test/test_index.py
@@ -1,9 +1,12 @@
 import numpy as np
 from common import *
+import pytest
 
 import tiledb.vector_search.index as ind
 from tiledb.vector_search import flat_index, ivf_flat_index
 from tiledb.vector_search.index import Index
+from tiledb.vector_search.ingestion import ingest
+from tiledb.vector_search.utils import load_fvecs
 
 def query_and_check(index, queries, k, expected, **kwargs):
     for _ in range(3):
@@ -89,3 +92,101 @@ def test_ivf_flat_index(tmp_path):
 
     index = index.consolidate_updates()
     query_and_check(index, np.array([[2, 2, 2]], dtype=np.float32), 3, {0, 2, 4}, nprobe=partitions)
+
+def test_index_with_incorrect_dimensions(tmp_path):
+    indexes = [flat_index, ivf_flat_index]
+    for index_type in indexes:
+        uri = os.path.join(tmp_path, f"array_{index_type.__name__}")
+        index = index_type.create(uri=uri, dimensions=3, vector_type=np.dtype(np.uint8))
+
+        # A query array that is not 1-D or 2-D (here 0-D, 3-D, and 4-D) will raise a TypeError.
+        with pytest.raises(TypeError):
+            index.query(np.array(1, dtype=np.float32), k=3)
+        with pytest.raises(TypeError):
+            index.query(np.array([[[1, 1, 1]]], dtype=np.float32), k=3)
+        with pytest.raises(TypeError):
+            index.query(np.array([[[[1, 1, 1]]]], dtype=np.float32), k=3)
+
+        # Okay otherwise.
+        index.query(np.array([1, 1, 1], dtype=np.float32), k=3)
+        index.query(np.array([[1, 1, 1]], dtype=np.float32), k=3)
+
+def test_index_with_incorrect_num_of_query_columns_simple(tmp_path):
+    siftsmall_uri = "test/data/siftsmall/siftsmall_base.fvecs"
+    queries_uri = "test/data/siftsmall/siftsmall_query.fvecs"
+    indexes = ["FLAT", "IVF_FLAT"]
+    for index_type in indexes:
+        index_uri = os.path.join(tmp_path, f"sift10k_flat_{index_type}")
+        index = ingest(
+            index_type=index_type,
+            index_uri=index_uri,
+            source_uri=siftsmall_uri,
+            source_type = "FVEC",
+        )
+
+        # Wrong number of columns will raise a TypeError.
+        query_shape = (1, 1)
+        with pytest.raises(TypeError):
+            index.query(np.random.rand(*query_shape).astype(np.float32), k=10)
+
+        # Okay otherwise.
+        query_vectors = load_fvecs(queries_uri)
+        index.query(query_vectors, k=10)
+
+def test_index_with_incorrect_num_of_query_columns_complex(tmp_path):
+    # Tests that we raise a TypeError if the number of columns in the query is not the same as the 
+    # number of columns in the indexed data.
+    size=1000
+    indexes = ["FLAT", "IVF_FLAT"]
+    num_columns_in_vector = [1, 2, 3, 4, 5, 10]
+    for index_type in indexes:
+        for num_columns in num_columns_in_vector:
+            index_uri = os.path.join(tmp_path, f"array_{index_type}_{num_columns}")
+            dataset_dir = os.path.join(tmp_path, f"dataset_{index_type}_{num_columns}")
+            create_random_dataset_f32_only_data(nb=size, d=num_columns, centers=1, path=dataset_dir)
+            index = ingest(index_type=index_type, index_uri=index_uri, source_uri=os.path.join(dataset_dir, "data.f32bin"))
+
+            # We have created a dataset with num_columns in each vector. Let's try creating queries 
+            # with different numbers of columns and confirming incorrect ones will throw.
+            for num_columns_for_query in range(1, num_columns + 2):
+                query_shape = (1, num_columns_for_query)
+                query = np.random.rand(*query_shape).astype(np.float32)
+                if query.shape[1] == num_columns:
+                    index.query(query, k=1)
+                else:
+                    with pytest.raises(TypeError):
+                        index.query(query, k=1)
+
+                # TODO(paris): The commented-out single-vector query below currently throws the following error.
+                # Fix and re-enable it, then remove test_index_with_incorrect_num_of_query_columns_in_single_vector_query:
+                #   def array_to_matrix(array: np.ndarray):
+                #           if array.dtype == np.float32:
+                #   >           return pyarray_copyto_matrix_f32(array)
+                #   E           RuntimeError: Number of dimensions must be two
+                # Here we test with a query which is just a vector, i.e. [1, 2, 3].
+                # query = query[0]
+                # if num_columns_for_query == num_columns:
+                #     index.query(query, k=1)
+                # else:
+                #     with pytest.raises(TypeError):
+                #         index.query(query, k=1)
+
+def test_index_with_incorrect_num_of_query_columns_in_single_vector_query(tmp_path):
+    # Tests that we raise a TypeError if the number of columns in the query is not the same as the 
+    # number of columns in the indexed data, specifically for a single vector query.
+    # i.e. queries = [1, 2, 3] instead of queries = [[1, 2, 3], [4, 5, 6]].
+    indexes = [flat_index, ivf_flat_index]
+    for index_type in indexes:
+        uri = os.path.join(tmp_path, f"array_{index_type.__name__}")
+        index = index_type.create(uri=uri, dimensions=3, vector_type=np.dtype(np.uint8))
+
+        # Wrong number of columns will raise a TypeError.
+        with pytest.raises(TypeError):
+            index.query(np.array([1], dtype=np.float32), k=3)
+        with pytest.raises(TypeError):
+            index.query(np.array([1, 1], dtype=np.float32), k=3)
+        with pytest.raises(TypeError):
+            index.query(np.array([1, 1, 1, 1], dtype=np.float32), k=3)
+
+        # Okay otherwise.
+        index.query(np.array([1, 1, 1], dtype=np.float32), k=3)
diff --git a/src/include/detail/linalg/tdb_partitioned_matrix.h b/src/include/detail/linalg/tdb_partitioned_matrix.h
@@ -293,6 +293,12 @@ class tdbPartitionedMatrix : public Matrix<T, LayoutPolicy, I> {
       std::get<1>(col_part_view_) = std::get<0>(col_part_view_);
       for (size_t i = std::get<0>(col_part_view_); i < total_num_parts_; ++i) {
         auto next_part_size = indices_[parts_[i] + 1] - indices_[parts_[i]];
+
+        // Continue if this partition is empty
+        if (next_part_size == 0) {
+          continue;
+        }
+
         if ((std::get<1>(col_view_) + next_part_size) >
             std::get<0>(col_view_) + max_cols_) {
           break;