Skip to content

Commit 2138fa7

Browse files
authored
Add Python IVF PQ Index (#404)
1 parent e322d71 commit 2138fa7

20 files changed

+679
-123
lines changed

apis/python/src/tiledb/vector_search/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from .index import Index
77
from .ingestion import ingest
88
from .ivf_flat_index import IVFFlatIndex
9+
from .ivf_pq_index import IVFPQIndex
910
from .module import array_to_matrix
1011
from .module import ivf_index
1112
from .module import ivf_index_tdb
@@ -31,6 +32,7 @@
3132
"FlatIndex",
3233
"IVFFlatIndex",
3334
"VamanaIndex",
35+
"IVFPQIndex",
3436
"Mode",
3537
"load_as_array",
3638
"load_as_matrix",

apis/python/src/tiledb/vector_search/index.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from tiledb.vector_search.utils import MAX_FLOAT32
1212
from tiledb.vector_search.utils import MAX_UINT64
1313
from tiledb.vector_search.utils import add_to_group
14+
from tiledb.vector_search.utils import is_type_erased_index
1415

1516
DATASET_TYPE = "vector_search"
1617

@@ -462,6 +463,10 @@ def consolidate_updates(self, retrain_index: bool = False, **kwargs):
462463
"""
463464
from tiledb.vector_search.ingestion import ingest
464465

466+
if self.index_type == "IVF_PQ":
467+
# TODO(SC-48888): Fix consolidation for IVF_PQ.
468+
raise ValueError("IVF_PQ indexes do not support consolidation yet.")
469+
465470
fragments_info = tiledb.array_fragments(
466471
self.updates_array_uri, ctx=tiledb.Ctx(self.config)
467472
)
@@ -566,14 +571,19 @@ def clear_history(
566571
f"Time traveling is not supported for index storage_version={storage_version}"
567572
)
568573

569-
if index_type == "VAMANA":
574+
if is_type_erased_index(index_type):
570575
if storage_formats[storage_version]["UPDATES_ARRAY_NAME"] in group:
571576
updates_array_uri = group[
572577
storage_formats[storage_version]["UPDATES_ARRAY_NAME"]
573578
].uri
574579
tiledb.Array.delete_fragments(updates_array_uri, 0, timestamp)
575580
ctx = vspy.Ctx(config)
576-
vspy.IndexVamana.clear_history(ctx, uri, timestamp)
581+
if index_type == "VAMANA":
582+
vspy.IndexVamana.clear_history(ctx, uri, timestamp)
583+
elif index_type == "IVF_PQ":
584+
vspy.IndexIVFPQ.clear_history(ctx, uri, timestamp)
585+
else:
586+
raise ValueError(f"Unsupported index_type: {index_type}")
577587
return
578588

579589
ingestion_timestamps = [

apis/python/src/tiledb/vector_search/ingestion.py

Lines changed: 43 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ def ingest(
5151
namespace: Optional[str] = None,
5252
size: int = -1,
5353
partitions: int = -1,
54+
num_subspaces: int = -1,
5455
training_sampling_policy: TrainingSamplingPolicy = TrainingSamplingPolicy.FIRST_N,
5556
copy_centroids_uri: str = None,
5657
training_sample_size: int = -1,
@@ -87,7 +88,7 @@ def ingest(
8788
Parameters
8889
----------
8990
index_type: str
90-
Type of vector index (FLAT, IVF_FLAT, VAMANA).
91+
Type of vector index (FLAT, IVF_FLAT, IVF_PQ, VAMANA).
9192
index_uri: str
9293
Vector index URI (stored as TileDB group).
9394
input_vectors: np.ndarray
@@ -114,7 +115,11 @@ def ingest(
114115
Number of input vectors, if not provided use the full size of the input dataset.
115116
If provided, we filter the first vectors from the input source.
116117
partitions: int
117-
Number of partitions to load the data with, if not provided, is auto-configured based on the dataset size.
118+
For IVF indexes, the number of partitions to load the data with, if not provided, is auto-configured based on the dataset size.
119+
num_subspaces: int
120+
For PQ encoded indexes, the number of subspaces to use in the PQ encoding. We will divide the dimensions into
121+
num_subspaces parts, and PQ encode each part separately. This means dimensions must
122+
be divisible by num_subspaces.
118123
copy_centroids_uri: str
119124
TileDB array URI to copy centroids from, if not provided, centroids are build running `k-means`.
120125
training_sample_size: int
@@ -199,6 +204,7 @@ def ingest(
199204
from tiledb.cloud.utilities import set_aws_context
200205
from tiledb.vector_search import flat_index
201206
from tiledb.vector_search import ivf_flat_index
207+
from tiledb.vector_search import ivf_pq_index
202208
from tiledb.vector_search import vamana_index
203209
from tiledb.vector_search.storage_formats import storage_formats
204210

@@ -1511,7 +1517,8 @@ def ingest_flat(
15111517
parts_array.close()
15121518
ids_array.close()
15131519

1514-
def ingest_vamana(
1520+
def ingest_type_erased(
1521+
index_type: str,
15151522
index_group_uri: str,
15161523
source_uri: str,
15171524
source_type: str,
@@ -1636,7 +1643,12 @@ def ingest_vamana(
16361643
from tiledb.vector_search import _tiledbvspy as vspy
16371644

16381645
ctx = vspy.Ctx(config)
1639-
index = vspy.IndexVamana(ctx, index_group_uri)
1646+
if index_type == "VAMANA":
1647+
index = vspy.IndexVamana(ctx, index_group_uri)
1648+
elif index_type == "IVF_PQ":
1649+
index = vspy.IndexIVFPQ(ctx, index_group_uri)
1650+
else:
1651+
raise ValueError(f"Unsupported index type: {index_type}")
16401652
data = vspy.FeatureVectorArray(
16411653
ctx, parts_array_uri, ids_array_uri, 0, to_temporal_policy(index_timestamp)
16421654
)
@@ -2191,9 +2203,10 @@ def create_ingestion_dag(
21912203
**kwargs,
21922204
)
21932205
return d
2194-
elif index_type == "VAMANA":
2206+
elif is_type_erased_index(index_type):
21952207
ingest_node = submit(
2196-
ingest_vamana,
2208+
ingest_type_erased,
2209+
index_type=index_type,
21972210
index_group_uri=index_group_uri,
21982211
source_uri=source_uri,
21992212
source_type=source_type,
@@ -2572,8 +2585,8 @@ def consolidate_and_vacuum(
25722585

25732586
logger.debug("Ingesting Vectors into %r", index_group_uri)
25742587
arrays_created = False
2575-
if index_type == "VAMANA":
2576-
# If we're using a type-erased index (i.e. Vamana), we create the group in C++.
2588+
if is_type_erased_index(index_type):
2589+
# If we're using a type-erased index, we create the group in C++.
25772590
try:
25782591
# Try opening the group to see if it exists.
25792592
group = tiledb.Group(index_group_uri, "r")
@@ -2583,13 +2596,26 @@ def consolidate_and_vacuum(
25832596
# If it does not then we can create it in C++.
25842597
message = str(err)
25852598
if "not exist" in message:
2586-
vamana_index.create(
2587-
uri=index_group_uri,
2588-
dimensions=dimensions,
2589-
vector_type=vector_type,
2590-
config=config,
2591-
storage_version=storage_version,
2592-
)
2599+
if index_type == "VAMANA":
2600+
vamana_index.create(
2601+
uri=index_group_uri,
2602+
dimensions=dimensions,
2603+
vector_type=vector_type,
2604+
config=config,
2605+
storage_version=storage_version,
2606+
)
2607+
elif index_type == "IVF_PQ":
2608+
ivf_pq_index.create(
2609+
uri=index_group_uri,
2610+
dimensions=dimensions,
2611+
vector_type=vector_type,
2612+
num_subspaces=num_subspaces,
2613+
partitions=partitions,
2614+
config=config,
2615+
storage_version=storage_version,
2616+
)
2617+
else:
2618+
raise ValueError(f"Unsupported index type {index_type}")
25932619
else:
25942620
raise err
25952621
else:
@@ -2860,5 +2886,7 @@ def consolidate_and_vacuum(
28602886
return ivf_flat_index.IVFFlatIndex(
28612887
uri=index_group_uri, memory_budget=1000000, config=config
28622888
)
2889+
elif index_type == "IVF_PQ":
2890+
return ivf_pq_index.IVFPQIndex(uri=index_group_uri, config=config)
28632891
else:
28642892
raise ValueError(f"Not supported index_type {index_type}")
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
"""
2+
IVFPQ Index implementation.
3+
"""
4+
import warnings
5+
from typing import Any, Mapping
6+
7+
import numpy as np
8+
9+
from tiledb.vector_search import _tiledbvspy as vspy
10+
from tiledb.vector_search import index
11+
from tiledb.vector_search.module import *
12+
from tiledb.vector_search.storage_formats import STORAGE_VERSION
13+
from tiledb.vector_search.storage_formats import storage_formats
14+
from tiledb.vector_search.storage_formats import validate_storage_version
15+
from tiledb.vector_search.utils import MAX_FLOAT32
16+
from tiledb.vector_search.utils import MAX_UINT64
17+
from tiledb.vector_search.utils import to_temporal_policy
18+
19+
INDEX_TYPE = "IVF_PQ"
20+
21+
22+
class IVFPQIndex(index.Index):
    """
    Opens a `IVFPQIndex`.

    Parameters
    ----------
    uri: str
        URI of the index.
    config: Optional[Mapping[str, Any]]
        TileDB config dictionary.
    timestamp: int or tuple(int)
        If int, open the index at a given timestamp.
        If tuple, open at the given start and end timestamps.
    open_for_remote_query_execution: bool
        If `True`, do not load any index data in main memory locally, and instead load index data in the TileDB Cloud taskgraph created when a non-`None` `driver_mode` is passed to `query()`.
        If `False`, load index data in main memory locally. Note that you can still use a taskgraph for query execution, you'll just end up loading the data both on your local machine and in the cloud taskgraph.
    """

    def __init__(
        self,
        uri: str,
        config: Optional[Mapping[str, Any]] = None,
        timestamp=None,
        open_for_remote_query_execution: bool = False,
        **kwargs,
    ):
        # Stash the open arguments so the index can be re-opened later
        # (e.g. after updates) with the same configuration.
        self.index_open_kwargs = {
            "uri": uri,
            "config": config,
            "timestamp": timestamp,
        }
        self.index_open_kwargs.update(kwargs)
        self.index_type = INDEX_TYPE
        super().__init__(
            uri=uri,
            config=config,
            timestamp=timestamp,
            open_for_remote_query_execution=open_for_remote_query_execution,
        )
        self.index = vspy.IndexIVFPQ(self.ctx, uri, to_temporal_policy(timestamp))
        # TODO(paris): This is incorrect - should be fixed when we fix consolidation.
        self.db_uri = self.group[
            storage_formats[self.storage_version]["PARTS_ARRAY_NAME"]
        ].uri
        self.ids_uri = self.group[
            storage_formats[self.storage_version]["IDS_ARRAY_NAME"]
        ].uri

        schema = tiledb.ArraySchema.load(self.db_uri, ctx=tiledb.Ctx(self.config))
        self.dimensions = self.index.dimensions()

        # Bug fix: the original wrapped the group-metadata value in np.dtype()
        # *before* the None check, but np.dtype(None) is dtype('float64'), so
        # `if self.dtype is None` could never fire and the schema fallback was
        # dead code. Inspect the raw metadata value first, and only fall back
        # to the parts array schema when no dtype was recorded.
        stored_dtype = self.group.meta.get("dtype", None)
        if stored_dtype is None:
            self.dtype = np.dtype(schema.attr("values").dtype)
        else:
            self.dtype = np.dtype(stored_dtype)

        if self.base_size == -1:
            # No recorded base size: infer vector count from the parts array
            # domain (inclusive upper bound, hence the +1).
            self.size = schema.domain.dim(1).domain[1] + 1
        else:
            self.size = self.base_size

    def get_dimensions(self):
        """
        Returns the dimension of the vectors in the index.
        """
        return self.dimensions

    def query_internal(
        self,
        queries: np.ndarray,
        k: int = 10,
        nprobe: Optional[int] = 100,
        **kwargs,
    ):
        """
        Queries a `IVFPQIndex`.

        Parameters
        ----------
        queries: np.ndarray
            2D array of query vectors. This can be used as a batch query interface by passing multiple queries in one call.
        k: int
            Number of results to return per query vector.
        nprobe: int
            Number of partitions to check per query.
            Use this parameter to trade-off accuracy for latency and cost.
        """
        warnings.warn("The IVF PQ index is not yet supported, please use with caution.")
        if self.size == 0:
            # Empty index: return sentinel distances/ids of the right shape.
            return np.full((queries.shape[0], k), MAX_FLOAT32), np.full(
                (queries.shape[0], k), MAX_UINT64
            )

        if queries.ndim == 1:
            queries = np.array([queries])
        # The C++ layer expects column-major (dimensions x num_queries) data.
        queries = np.transpose(queries)
        if not queries.flags.f_contiguous:
            queries = queries.copy(order="F")
        queries_feature_vector_array = vspy.FeatureVectorArray(queries)

        distances, ids = self.index.query(
            vspy.QueryType.InfiniteRAM, queries_feature_vector_array, k, nprobe
        )

        return np.array(distances, copy=False), np.array(ids, copy=False)
128+
129+
130+
def create(
    uri: str,
    dimensions: int,
    vector_type: np.dtype,
    num_subspaces: int,
    config: Optional[Mapping[str, Any]] = None,
    storage_version: str = STORAGE_VERSION,
    partitions: Optional[int] = None,
    **kwargs,
) -> IVFPQIndex:
    """
    Creates an empty IVFPQIndex.

    Parameters
    ----------
    uri: str
        URI of the index.
    dimensions: int
        Number of dimensions for the vectors to be stored in the index.
    vector_type: np.dtype
        Datatype of vectors.
        Supported values (uint8, int8, float32).
    num_subspaces: int
        Number of subspaces to use in the PQ encoding. We will divide the dimensions into
        num_subspaces parts, and PQ encode each part separately. This means dimensions must
        be divisible by num_subspaces.
    config: Optional[Mapping[str, Any]]
        TileDB config dictionary.
    storage_version: str
        The TileDB vector search storage version to use.
        If not provided, use the latest stable storage version.
    partitions: int
        Number of partitions to load the data with, if not provided, is auto-configured
        based on the dataset size.

    Raises
    ------
    ValueError
        If `num_subspaces` is not positive, or `dimensions` is not divisible
        by `num_subspaces`.
    """
    warnings.warn("The IVF PQ index is not yet supported, please use with caution.")
    validate_storage_version(storage_version)
    ctx = vspy.Ctx(config)
    # Validate PQ parameters up front so we fail before touching storage.
    if num_subspaces <= 0:
        raise ValueError(
            f"Number of num_subspaces ({num_subspaces}) must be greater than 0."
        )
    if dimensions % num_subspaces != 0:
        raise ValueError(
            f"Number of dimensions ({dimensions}) must be divisible by num_subspaces ({num_subspaces})."
        )
    index = vspy.IndexIVFPQ(
        feature_type=np.dtype(vector_type).name,
        id_type=np.dtype(np.uint64).name,
        partitioning_index_type=np.dtype(np.uint64).name,
        dimensions=dimensions,
        # Bug fix: the original used `partitions is not -1`, which is an
        # identity comparison against an int literal — implementation-defined
        # and a SyntaxWarning on CPython >= 3.8. Use `!=` for the -1 sentinel.
        # 0 tells the C++ layer to auto-configure the number of partitions.
        n_list=partitions if (partitions is not None and partitions != -1) else 0,
        num_subspaces=num_subspaces,
    )
    # TODO(paris): Run all of this with a single C++ call.
    # Train/add on an empty vector array so the group is fully materialized
    # on disk before the index is reopened below.
    empty_vector = vspy.FeatureVectorArray(
        dimensions, 0, np.dtype(vector_type).name, np.dtype(np.uint64).name
    )
    index.train(empty_vector)
    index.add(empty_vector)
    index.write_index(ctx, uri, vspy.TemporalPolicy(0), storage_version)
    return IVFPQIndex(uri=uri, config=config)

0 commit comments

Comments
 (0)