feat: add NumPy array support for vector representations (#586)

lemorage · web-flow · commit d935c17650e4 · 2025-06-07T22:56:03.000-07:00
* chore: update example code to use NDArray for vectors

* feat: add numpy array support for vector representations

* feat: refactor vector decoding with DtypeRegistry for NumPy

* test: add unit tests for numpy array support

* Update docs to reflect NDArray support with NumPy dtypes

* feat: add NDArray support for `Vector[T]` with `list[T]` fallback, optimize pgvector queries

* feat: update engine value encoding to return ndarray directly
diff --git a/docs/docs/core/data_types.mdx b/docs/docs/core/data_types.mdx
@@ -36,16 +36,17 @@ This is the list of all basic types supported by CocoIndex:
 | LocalDatetime | Date and time without timezone | `cocoindex.LocalDateTime` | `datetime.datetime` |
 | OffsetDatetime | Date and time with a timezone offset | `cocoindex.OffsetDateTime` | `datetime.datetime` |
 | TimeDelta | A duration of time | `datetime.timedelta` | `datetime.timedelta` |
-| Vector[*T*, *Dim*?] | *T* must be basic type. *Dim* is a positive integer and optional. |`cocoindex.Vector[T]` or `cocoindex.Vector[T, Dim]` | `list[T]` | 
 | Json | | `cocoindex.Json` | Any data convertible to JSON by `json` package | 
+| Vector[*T*, *Dim*?] | *T* can be a basic type or a numeric type. *Dim* is a positive integer and optional. | `cocoindex.Vector[T]` or `cocoindex.Vector[T, Dim]` | `numpy.typing.NDArray[T]` or `list[T]` |
 
 Values of all data types can be represented by values in Python's native types (as described under the Native Python Type column).
 However, the underlying execution engine and some storage system (like Postgres) has finer distinctions for some types, specifically:
 
 *   *Float32* and *Float64* for `float`, with different precision.
 *   *LocalDateTime* and *OffsetDateTime* for `datetime.datetime`, with different timezone awareness.
-*   *Vector* has optional dimension information.
 *   *Range* and *Json* provide a clear tag for the type, to clearly distinguish the type in CocoIndex.
+*   *Vector* holds elements of type *T*. If *T* is numeric (e.g., `np.float32` or `np.float64`), it's represented as `NDArray[T]`; otherwise, as `list[T]`.
+*   *Vector* also has optional dimension information.
 
 The native Python type is always more permissive and can represent a superset of possible values.
 *   Only when you annotate the return type of a custom function, you should use the specific type,
diff --git a/docs/docs/getting_started/quickstart.md b/docs/docs/getting_started/quickstart.md
@@ -154,11 +154,11 @@ The goal of transforming your data is usually to query against it.
 Once you already have your index built, you can directly access the transformed data in the target database.
 CocoIndex also provides utilities for you to do this more seamlessly.
 
-In this example, we'll use the [`psycopg` library](https://www.psycopg.org/) to connect to the database and run queries.
-Please make sure it's installed:
+In this example, we'll use the [`psycopg` library](https://www.psycopg.org/) along with pgvector to connect to the database and run queries on vector data.
+Please make sure the required packages are installed:
 
 ```bash
-pip install psycopg[binary,pool]
+pip install numpy psycopg[binary,pool] pgvector
 ```
 
 ### Step 4.1: Extract common transformations
@@ -169,8 +169,11 @@ i.e. they should use exactly the same embedding model and parameters.
 Let's extract that into a function:
 
 ```python title="quickstart.py"
+from numpy.typing import NDArray
+import numpy as np
+
 @cocoindex.transform_flow()
-def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]:
+def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[NDArray[np.float32]]:
     return text.transform(
         cocoindex.functions.SentenceTransformerEmbed(
             model="sentence-transformers/all-MiniLM-L6-v2"))
@@ -207,6 +210,7 @@ Now we can create a function to query the index upon a given input query:
 
 ```python title="quickstart.py"
 from psycopg_pool import ConnectionPool
+from pgvector.psycopg import register_vector
 
 def search(pool: ConnectionPool, query: str, top_k: int = 5):
     # Get the table name, for the export target in the text_embedding_flow above.
@@ -215,9 +219,10 @@ def search(pool: ConnectionPool, query: str, top_k: int = 5):
     query_vector = text_to_embedding.eval(query)
     # Run the query and get the results.
     with pool.connection() as conn:
+        register_vector(conn)
         with conn.cursor() as cur:
             cur.execute(f"""
-                SELECT filename, text, embedding <=> %s::vector AS distance
+                SELECT filename, text, embedding <=> %s AS distance
                 FROM {table_name} ORDER BY distance LIMIT %s
             """, (query_vector, top_k))
             return [
@@ -236,7 +241,7 @@ There're two CocoIndex-specific logic:
 
 2.  Evaluate the transform flow defined above with the input query, to get the embedding.
     It's done by the `eval()` method of the transform flow `text_to_embedding`.
-    The return type of this method is `list[float]` as declared in the `text_to_embedding()` function (`cocoindex.DataSlice[list[float]]`).
+    The return type of this method is `NDArray[np.float32]` as declared in the `text_to_embedding()` function (`cocoindex.DataSlice[NDArray[np.float32]]`).
 
 ### Step 4.3: Add the main script logic
 
diff --git a/docs/docs/query.mdx b/docs/docs/query.mdx
@@ -41,7 +41,7 @@ The [quickstart](getting_started/quickstart#step-41-extract-common-transformatio
 
 ```python
 @cocoindex.transform_flow()
-def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]:
+def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[NDArray[np.float32]]:
     return text.transform(
         cocoindex.functions.SentenceTransformerEmbed(
             model="sentence-transformers/all-MiniLM-L6-v2"))
@@ -61,7 +61,7 @@ with doc["chunks"].row() as chunk:
     chunk["embedding"] = chunk["text"].call(text_to_embedding)
 ```
 
-Any time, you can call the `eval()` method with specific string, which will return a `list[float]`:
+Any time, you can call the `eval()` method with specific string, which will return a `NDArray[np.float32]`:
 
 ```python
 print(text_to_embedding.eval("Hello, world!"))
@@ -93,7 +93,7 @@ For example:
 
 ```python
 table_name = cocoindex.utils.get_target_storage_default_name(text_embedding_flow, "doc_embeddings")
-query = f"SELECT filename, text FROM {table_name} ORDER BY embedding <=> %s::vector DESC LIMIT 5"
+query = f"SELECT filename, text FROM {table_name} ORDER BY embedding <=> %s DESC LIMIT 5"
 ...
 ```
 
diff --git a/examples/text_embedding/Text_Embedding.ipynb b/examples/text_embedding/Text_Embedding.ipynb
@@ -45,7 +45,7 @@
    },
    "outputs": [],
    "source": [
-    "%pip install cocoindex python-dotenv psycopg[binary,pool]"
+    "%pip install cocoindex numpy python-dotenv psycopg[binary,pool] pgvector"
    ]
   },
   {
@@ -164,7 +164,10 @@
     "from dotenv import load_dotenv\n",
     "import os\n",
     "from psycopg_pool import ConnectionPool\n",
-    "import cocoindex\n"
+    "from pgvector.psycopg import register_vector\n",
+    "import cocoindex\n",
+    "from numpy.typing import NDArray\n",
+    "import numpy as np\n"
    ]
   },
   {
@@ -187,7 +190,7 @@
     "%%writefile -a main.py\n",
     "\n",
     "@cocoindex.transform_flow()\n",
-    "def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]:\n",
+    "def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[NDArray[np.float32]]:\n",
     "    \"\"\"\n",
     "    Embed the text using a SentenceTransformer model.\n",
     "    This is shared logic between indexing and querying.\n",
@@ -274,9 +277,10 @@
     "    query_vector = text_to_embedding.eval(query)\n",
     "    # Run the query and get the results.\n",
     "    with pool.connection() as conn:\n",
+    "        register_vector(conn)\n",
     "        with conn.cursor() as cur:\n",
     "            cur.execute(f\"\"\"\n",
-    "                SELECT filename, text, embedding <=> %s::vector AS distance\n",
+    "                SELECT filename, text, embedding <=> %s AS distance\n",
     "                FROM {table_name} ORDER BY distance LIMIT %s\n",
     "            \"\"\", (query_vector, top_k))\n",
     "            return [\n",
diff --git a/examples/text_embedding/main.py b/examples/text_embedding/main.py
@@ -1,13 +1,16 @@
 from dotenv import load_dotenv
 from psycopg_pool import ConnectionPool
+from pgvector.psycopg import register_vector
 import cocoindex
 import os
+from numpy.typing import NDArray
+import numpy as np
 
 
 @cocoindex.transform_flow()
 def text_to_embedding(
     text: cocoindex.DataSlice[str],
-) -> cocoindex.DataSlice[list[float]]:
+) -> cocoindex.DataSlice[NDArray[np.float32]]:
     """
     Embed the text using a SentenceTransformer model.
     This is a shared logic between indexing and querying, so extract it as a function.
@@ -71,10 +74,11 @@ def search(pool: ConnectionPool, query: str, top_k: int = 5):
     query_vector = text_to_embedding.eval(query)
     # Run the query and get the results.
     with pool.connection() as conn:
+        register_vector(conn)
         with conn.cursor() as cur:
             cur.execute(
                 f"""
-                SELECT filename, text, embedding <=> %s::vector AS distance
+                SELECT filename, text, embedding <=> %s AS distance
                 FROM {table_name} ORDER BY distance LIMIT %s
             """,
                 (query_vector, top_k),
diff --git a/examples/text_embedding/pyproject.toml b/examples/text_embedding/pyproject.toml
@@ -6,7 +6,9 @@ requires-python = ">=3.10"
 dependencies = [
     "cocoindex>=0.1.42",
     "python-dotenv>=1.0.1",
+    "pgvector>=0.4.1",
     "psycopg[binary,pool]",
+    "numpy",
 ]
 
 [tool.setuptools]
diff --git a/python/cocoindex/convert.py b/python/cocoindex/convert.py
@@ -6,6 +6,7 @@
 import datetime
 import inspect
 import uuid
+import numpy as np
 
 from enum import Enum
 from typing import Any, Callable, get_origin, Mapping
@@ -15,6 +16,7 @@
     is_namedtuple_type,
     TABLE_TYPES,
     KEY_FIELD_NAME,
+    DtypeRegistry,
 )
 
 
@@ -27,6 +29,8 @@ def encode_engine_value(value: Any) -> Any:
         ]
     if is_namedtuple_type(type(value)):
         return [encode_engine_value(getattr(value, name)) for name in value._fields]
+    if isinstance(value, np.ndarray):
+        return value
     if isinstance(value, (list, tuple)):
         return [encode_engine_value(v) for v in value]
     if isinstance(value, dict):
@@ -122,6 +126,38 @@ def decode(value: Any) -> Any | None:
     if src_type_kind == "Uuid":
         return lambda value: uuid.UUID(bytes=value)
 
+    if src_type_kind == "Vector":
+        elem_coco_type_info = analyze_type_info(dst_type_info.elem_type)
+        dtype_info = DtypeRegistry.get_by_kind(elem_coco_type_info.kind)
+
+        def decode_vector(value: Any) -> Any | None:
+            if value is None:
+                if dst_type_info.nullable:
+                    return None
+                raise ValueError(
+                    f"Received null for non-nullable vector `{''.join(field_path)}`"
+                )
+
+            if not isinstance(value, (np.ndarray, list)):
+                raise TypeError(
+                    f"Expected NDArray or list for vector `{''.join(field_path)}`, got {type(value)}"
+                )
+            expected_dim = (
+                dst_type_info.vector_info.dim if dst_type_info.vector_info else None
+            )
+            if expected_dim is not None and len(value) != expected_dim:
+                raise ValueError(
+                    f"Vector dimension mismatch for `{''.join(field_path)}`: "
+                    f"expected {expected_dim}, got {len(value)}"
+                )
+
+            # Use NDArray for supported numeric dtypes, else return list
+            if dtype_info is not None:
+                return np.array(value, dtype=dtype_info.numpy_dtype)
+            return value
+
+        return decode_vector
+
     return lambda value: value
 
 
diff --git a/python/cocoindex/functions.py b/python/cocoindex/functions.py
@@ -1,6 +1,8 @@
 """All builtin functions."""
 
-from typing import Annotated, Any, TYPE_CHECKING
+from typing import Annotated, Any, TYPE_CHECKING, Literal
+import numpy as np
+from numpy.typing import NDArray
 import dataclasses
 
 from .typing import Float32, Vector, TypeAttr
@@ -66,11 +68,11 @@ def analyze(self, text: Any) -> type:
         self._model = sentence_transformers.SentenceTransformer(self.spec.model, **args)
         dim = self._model.get_sentence_embedding_dimension()
         result: type = Annotated[
-            Vector[Float32, dim],  # type: ignore
+            Vector[np.float32, Literal[dim]],  # type: ignore
             TypeAttr("cocoindex.io/vector_origin_text", text.analyzed_value),
         ]
         return result
 
-    def __call__(self, text: str) -> list[Float32]:
-        result: list[Float32] = self._model.encode(text).tolist()
+    def __call__(self, text: str) -> NDArray[np.float32]:
+        result: NDArray[np.float32] = self._model.encode(text, convert_to_numpy=True)
         return result
diff --git a/python/cocoindex/tests/test_convert.py b/python/cocoindex/tests/test_convert.py
diff --git a/python/cocoindex/tests/test_typing.py b/python/cocoindex/tests/test_typing.py
diff --git a/python/cocoindex/typing.py b/python/cocoindex/typing.py

Original file line number	Diff line number	Diff line change
`@@ -6,7 +6,9 @@ requires-python = ">=3.10"`
`6`	`6`	`dependencies = [`
`7`	`7`	`"cocoindex>=0.1.42",`
`8`	`8`	`"python-dotenv>=1.0.1",`
	`9`	`+ "pgvector>=0.4.1",`
`9`	`10`	`"psycopg[binary,pool]",`
	`11`	`+ "numpy",`
`10`	`12`	`]`
`11`	`13`
`12`	`14`	`[tool.setuptools]`