|
| 1 | +""" |
| 2 | +IVFPQ Index implementation. |
| 3 | +""" |
import warnings
from typing import Any, Mapping, Optional

import numpy as np

from tiledb.vector_search import _tiledbvspy as vspy
from tiledb.vector_search import index
from tiledb.vector_search.module import *
from tiledb.vector_search.storage_formats import STORAGE_VERSION
from tiledb.vector_search.storage_formats import storage_formats
from tiledb.vector_search.storage_formats import validate_storage_version
from tiledb.vector_search.utils import MAX_FLOAT32
from tiledb.vector_search.utils import MAX_UINT64
from tiledb.vector_search.utils import to_temporal_policy
| 18 | + |
| 19 | +INDEX_TYPE = "IVF_PQ" |
| 20 | + |
| 21 | + |
class IVFPQIndex(index.Index):
    """
    Opens a `IVFPQIndex`.

    Parameters
    ----------
    uri: str
        URI of the index.
    config: Optional[Mapping[str, Any]]
        TileDB config dictionary.
    timestamp: int or tuple(int)
        If int, open the index at a given timestamp.
        If tuple, open at the given start and end timestamps.
    open_for_remote_query_execution: bool
        If `True`, do not load any index data in main memory locally, and instead load index data in the TileDB Cloud taskgraph created when a non-`None` `driver_mode` is passed to `query()`.
        If `False`, load index data in main memory locally. Note that you can still use a taskgraph for query execution, you'll just end up loading the data both on your local machine and in the cloud taskgraph.
    """

    def __init__(
        self,
        uri: str,
        config: Optional[Mapping[str, Any]] = None,
        timestamp=None,
        open_for_remote_query_execution: bool = False,
        **kwargs,
    ):
        # Record the exact open arguments so the index can be re-opened later
        # with identical settings (used by the base-class update machinery).
        self.index_open_kwargs = {
            "uri": uri,
            "config": config,
            "timestamp": timestamp,
        }
        self.index_open_kwargs.update(kwargs)
        self.index_type = INDEX_TYPE
        super().__init__(
            uri=uri,
            config=config,
            timestamp=timestamp,
            open_for_remote_query_execution=open_for_remote_query_execution,
        )
        self.index = vspy.IndexIVFPQ(self.ctx, uri, to_temporal_policy(timestamp))
        # TODO(paris): This is incorrect - should be fixed when we fix consolidation.
        self.db_uri = self.group[
            storage_formats[self.storage_version]["PARTS_ARRAY_NAME"]
        ].uri
        self.ids_uri = self.group[
            storage_formats[self.storage_version]["IDS_ARRAY_NAME"]
        ].uri

        schema = tiledb.ArraySchema.load(self.db_uri, ctx=tiledb.Ctx(self.config))
        self.dimensions = self.index.dimensions()

        # Prefer the dtype recorded in group metadata, falling back to the
        # parts-array schema. NOTE: the previous code passed the metadata value
        # directly to np.dtype(); since np.dtype(None) silently yields float64,
        # the `is None` fallback branch was unreachable. Branch on the raw
        # metadata value before converting instead.
        dtype_meta = self.group.meta.get("dtype", None)
        if dtype_meta is None:
            self.dtype = np.dtype(schema.attr("values").dtype)
        else:
            self.dtype = np.dtype(dtype_meta)

        if self.base_size == -1:
            # No recorded base size: infer the vector count from the upper
            # bound of the second dimension of the parts array domain.
            self.size = schema.domain.dim(1).domain[1] + 1
        else:
            self.size = self.base_size

    def get_dimensions(self):
        """
        Returns the dimension of the vectors in the index.
        """
        return self.dimensions

    def query_internal(
        self,
        queries: np.ndarray,
        k: int = 10,
        nprobe: Optional[int] = 100,
        **kwargs,
    ):
        """
        Queries a `IVFPQIndex`.

        Parameters
        ----------
        queries: np.ndarray
            2D array of query vectors. This can be used as a batch query interface by passing multiple queries in one call.
        k: int
            Number of results to return per query vector.
        nprobe: int
            Number of partitions to check per query.
            Use this parameter to trade-off accuracy for latency and cost.
        """
        warnings.warn("The IVF PQ index is not yet supported, please use with caution.")
        if self.size == 0:
            # Empty index: return sentinel distances/ids in the expected shape.
            return np.full((queries.shape[0], k), MAX_FLOAT32), np.full(
                (queries.shape[0], k), MAX_UINT64
            )

        if queries.ndim == 1:
            queries = np.array([queries])
        # The C++ layer expects column-major (Fortran-order) data with one
        # query per column, hence the transpose and the F-contiguity check.
        queries = np.transpose(queries)
        if not queries.flags.f_contiguous:
            queries = queries.copy(order="F")
        queries_feature_vector_array = vspy.FeatureVectorArray(queries)

        distances, ids = self.index.query(
            vspy.QueryType.InfiniteRAM, queries_feature_vector_array, k, nprobe
        )

        # copy=False avoids duplicating the result buffers returned from C++.
        return np.array(distances, copy=False), np.array(ids, copy=False)
| 128 | + |
| 129 | + |
def create(
    uri: str,
    dimensions: int,
    vector_type: np.dtype,
    num_subspaces: int,
    config: Optional[Mapping[str, Any]] = None,
    storage_version: str = STORAGE_VERSION,
    partitions: Optional[int] = None,
    **kwargs,
) -> IVFPQIndex:
    """
    Creates an empty IVFPQIndex.

    Parameters
    ----------
    uri: str
        URI of the index.
    dimensions: int
        Number of dimensions for the vectors to be stored in the index.
    vector_type: np.dtype
        Datatype of vectors.
        Supported values (uint8, int8, float32).
    num_subspaces: int
        Number of subspaces to use in the PQ encoding. We will divide the dimensions into
        num_subspaces parts, and PQ encode each part separately. This means dimensions must
        be divisible by num_subspaces.
    config: Optional[Mapping[str, Any]]
        TileDB config dictionary.
    storage_version: str
        The TileDB vector search storage version to use.
        If not provided, use the latest stable storage version.
    partitions: int
        Number of partitions to load the data with, if not provided, is auto-configured
        based on the dataset size.

    Raises
    ------
    ValueError
        If `num_subspaces` is not positive, or `dimensions` is not divisible
        by `num_subspaces`.
    """
    warnings.warn("The IVF PQ index is not yet supported, please use with caution.")
    validate_storage_version(storage_version)
    # Validate arguments before allocating any native resources.
    if num_subspaces <= 0:
        raise ValueError(
            f"Number of num_subspaces ({num_subspaces}) must be greater than 0."
        )
    if dimensions % num_subspaces != 0:
        raise ValueError(
            f"Number of dimensions ({dimensions}) must be divisible by num_subspaces ({num_subspaces})."
        )
    ctx = vspy.Ctx(config)
    # BUG FIX: the original used `partitions is not -1`, an identity comparison
    # against an int literal (implementation-defined, and a SyntaxWarning on
    # modern CPython); use a value comparison instead. -1 and None both mean
    # "auto-configure" (n_list=0).
    n_list = partitions if (partitions is not None and partitions != -1) else 0
    # Named `ivf_pq_index` to avoid shadowing the imported `index` module.
    ivf_pq_index = vspy.IndexIVFPQ(
        feature_type=np.dtype(vector_type).name,
        id_type=np.dtype(np.uint64).name,
        partitioning_index_type=np.dtype(np.uint64).name,
        dimensions=dimensions,
        n_list=n_list,
        num_subspaces=num_subspaces,
    )
    # TODO(paris): Run all of this with a single C++ call.
    # Train/add on an empty vector array so the on-disk layout is materialized.
    empty_vector = vspy.FeatureVectorArray(
        dimensions, 0, np.dtype(vector_type).name, np.dtype(np.uint64).name
    )
    ivf_pq_index.train(empty_vector)
    ivf_pq_index.add(empty_vector)
    ivf_pq_index.write_index(ctx, uri, vspy.TemporalPolicy(0), storage_version)
    return IVFPQIndex(uri=uri, config=config)
0 commit comments