|
1 | 1 | import numpy as np |
2 | 2 | from common import * |
| 3 | +import pytest |
3 | 4 |
|
4 | 5 | import tiledb.vector_search.index as ind |
5 | 6 | from tiledb.vector_search import flat_index, ivf_flat_index |
6 | 7 | from tiledb.vector_search.index import Index |
| 8 | +from tiledb.vector_search.ingestion import ingest |
| 9 | +from tiledb.vector_search.utils import load_fvecs |
7 | 10 |
|
8 | 11 | def query_and_check(index, queries, k, expected, **kwargs): |
9 | 12 | for _ in range(3): |
@@ -89,3 +92,101 @@ def test_ivf_flat_index(tmp_path): |
89 | 92 |
|
90 | 93 | index = index.consolidate_updates() |
91 | 94 | query_and_check(index, np.array([[2, 2, 2]], dtype=np.float32), 3, {0, 2, 4}, nprobe=partitions) |
| 95 | + |
def test_index_with_incorrect_dimensions(tmp_path):
    """Check that queries with an unsupported array rank raise a TypeError.

    For both the flat and the IVF-flat index, a 0-d, 3-d or 4-d query array is
    expected to raise, while a 1-d vector and a 2-d single-row matrix are
    queried without expecting any exception.
    """
    for index_module in (flat_index, ivf_flat_index):
        index = index_module.create(
            uri=os.path.join(tmp_path, f"array_{index_module.__name__}"),
            dimensions=3,
            vector_type=np.dtype(np.uint8),
        )

        # Wrong number of dimensions (rank 0, 3 and 4) will raise a TypeError.
        for bad_values in (1, [[[1, 1, 1]]], [[[[1, 1, 1]]]]):
            with pytest.raises(TypeError):
                index.query(np.array(bad_values, dtype=np.float32), k=3)

        # Okay otherwise: a single vector, then a one-row matrix.
        for good_values in ([1, 1, 1], [[1, 1, 1]]):
            index.query(np.array(good_values, dtype=np.float32), k=3)
| 113 | + |
def test_index_with_incorrect_num_of_query_columns_simple(tmp_path):
    """Check that a query with the wrong number of columns raises a TypeError.

    Ingests the siftsmall base vectors into a FLAT and an IVF_FLAT index, then
    expects a (1, 1)-shaped query to be rejected and the siftsmall query
    vectors to be queried without an exception.
    """
    base_vectors_uri = "test/data/siftsmall/siftsmall_base.fvecs"
    query_vectors_uri = "test/data/siftsmall/siftsmall_query.fvecs"
    for index_type in ("FLAT", "IVF_FLAT"):
        index = ingest(
            index_type=index_type,
            index_uri=os.path.join(tmp_path, f"sift10k_flat_{index_type}"),
            source_uri=base_vectors_uri,
            source_type="FVEC",
        )

        # Wrong number of columns will raise a TypeError.
        single_column_query = np.random.rand(1, 1).astype(np.float32)
        with pytest.raises(TypeError):
            index.query(single_column_query, k=10)

        # Okay otherwise: queries loaded from the matching query file.
        index.query(load_fvecs(query_vectors_uri), k=10)
| 135 | + |
def test_index_with_incorrect_num_of_query_columns_complex(tmp_path):
    """Check that mismatched query widths raise a TypeError across many sizes.

    For each index type and each vector width, a random dataset is created and
    ingested, and the index is then queried with every width from 1 up to one
    more than the indexed width. Only the matching width is expected to
    succeed; every other width must raise a TypeError.
    """
    vector_count = 1000
    for index_type in ("FLAT", "IVF_FLAT"):
        for num_columns in (1, 2, 3, 4, 5, 10):
            index_uri = os.path.join(tmp_path, f"array_{index_type}_{num_columns}")
            dataset_dir = os.path.join(tmp_path, f"dataset_{index_type}_{num_columns}")
            create_random_dataset_f32_only_data(
                nb=vector_count,
                d=num_columns,
                centers=1,
                path=dataset_dir,
            )
            index = ingest(
                index_type=index_type,
                index_uri=index_uri,
                source_uri=os.path.join(dataset_dir, "data.f32bin"),
            )

            # The dataset has num_columns values per vector. Try queries of
            # every width from 1 to num_columns + 1 and confirm that only the
            # matching width is accepted.
            for query_width in range(1, num_columns + 2):
                query = np.random.rand(1, query_width).astype(np.float32)
                if query_width != num_columns:
                    with pytest.raises(TypeError):
                        index.query(query, k=1)
                else:
                    index.query(query, k=1)

                # TODO(paris): Also test a query that is just a vector, i.e.
                # [1, 2, 3], by repeating the check above with `query[0]`.
                # That currently fails inside array_to_matrix, where
                # pyarray_copyto_matrix_f32(array) raises
                # "RuntimeError: Number of dimensions must be two".
                # Fix and re-enable, then remove
                # test_index_with_incorrect_num_of_query_columns_in_single_vector_query.
| 173 | + |
def test_index_with_incorrect_num_of_query_columns_in_single_vector_query(tmp_path):
    """Check that a single-vector query of the wrong length raises a TypeError.

    Covers queries passed as one flat vector, i.e. queries = [1, 2, 3] instead
    of queries = [[1, 2, 3], [4, 5, 6]]. Each index is created with three
    dimensions, so vectors of length 1, 2 and 4 are expected to raise while a
    vector of length 3 is queried without expecting any exception.
    """
    for index_module in (flat_index, ivf_flat_index):
        index = index_module.create(
            uri=os.path.join(tmp_path, f"array_{index_module.__name__}"),
            dimensions=3,
            vector_type=np.dtype(np.uint8),
        )

        # Wrong number of columns will raise a TypeError.
        for bad_vector in ([1], [1, 1], [1, 1, 1, 1]):
            with pytest.raises(TypeError):
                index.query(np.array(bad_vector, dtype=np.float32), k=3)

        # Okay otherwise: the length matches the index dimensions.
        index.query(np.array([1, 1, 1], dtype=np.float32), k=3)
0 commit comments