From 7462cc0608a229ecb9366645a8010100a7341d34 Mon Sep 17 00:00:00 2001 From: Christopher McLaughlin Date: Fri, 10 Jan 2025 10:19:08 +0000 Subject: [PATCH 1/2] feat: Add OpenSearch Binary & Byte data_type support --- llama-index-core/llama_index/core/schema.py | 2 +- .../vector_stores/opensearch/base.py | 45 ++- .../tests/test_opensearch_client.py | 332 ++++++++++++++++++ 3 files changed, 367 insertions(+), 12 deletions(-) diff --git a/llama-index-core/llama_index/core/schema.py b/llama-index-core/llama_index/core/schema.py index 045f7122c0..e32bacae21 100644 --- a/llama-index-core/llama_index/core/schema.py +++ b/llama-index-core/llama_index/core/schema.py @@ -269,7 +269,7 @@ class BaseNode(BaseComponent): id_: str = Field( default_factory=lambda: str(uuid.uuid4()), description="Unique ID of the node." ) - embedding: Optional[List[float]] = Field( + embedding: Optional[List[Union[float, int]]] = Field( default=None, description="Embedding of the node." ) diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-opensearch/llama_index/vector_stores/opensearch/base.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-opensearch/llama_index/vector_stores/opensearch/base.py index 8fdd0f5c3a..ac0f792c18 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-opensearch/llama_index/vector_stores/opensearch/base.py +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-opensearch/llama_index/vector_stores/opensearch/base.py @@ -32,6 +32,16 @@ ) MATCH_ALL_QUERY = {"match_all": {}} # type: Dict +VALID_DATA_TYPES = ["float", "byte", "binary"] +BYTE_VECTOR_ENGINES = ["lucene", "faiss"] +BINARY_VECTOR_ENGINE = "faiss" +INVALID_BYTE_VECTOR_ENGINE = ( + "Byte vectors only support 'lucene' or 'faiss' as the engine type." 
+) +INVALID_DATA_TYPE = f"Data type must be one of {VALID_DATA_TYPES}" +INVALID_BINARY_ENGINE = "Binary vectors must use 'faiss' as the engine type" +INVALID_BINARY_SPACE_TYPE = "Binary vectors must use 'hamming' as the space type" + class OpensearchVectorClient: """ @@ -48,18 +58,12 @@ class OpensearchVectorClient: embedding_field (str): Name of the field in the index to store embedding array in. text_field (str): Name of the field to grab text from + data_type (str): Type of vector data. One of ["float", "byte", "binary"] method (Optional[dict]): Opensearch "method" JSON obj for configuring the KNN index. - This includes engine, metric, and other config params. Defaults to: - {"name": "hnsw", "space_type": "l2", "engine": "nmslib", - "parameters": {"ef_construction": 256, "m": 48}} - settings: Optional[dict]: Settings for the Opensearch index creation. Defaults to: - {"index": {"knn": True, "knn.algo_param.ef_search": 100}} - space_type (Optional[str]): space type for distance metric calculation. Defaults to: l2 - os_client (Optional[OSClient]): Custom synchronous client (see OpenSearch from opensearch-py) - os_async_client (Optional[OSClient]): Custom asynchronous client (see AsyncOpenSearch from opensearch-py) - **kwargs: Optional arguments passed to the OpenSearch client from opensearch-py. - + This includes engine, metric, and other config params. + space_type (Optional[str]): space type for distance metric calculation. + **kwargs: Optional arguments passed to the OpenSearch client. 
""" def __init__( @@ -69,6 +73,7 @@ def __init__( dim: int, embedding_field: str = "embedding", text_field: str = "content", + data_type: str = "float", method: Optional[dict] = None, settings: Optional[dict] = None, engine: Optional[str] = "nmslib", @@ -80,10 +85,26 @@ def __init__( **kwargs: Any, ): """Init params.""" + if method is not None: + engine = method.get("engine", engine) + space_type = method.get("space_type", space_type) + + if data_type not in VALID_DATA_TYPES: + raise ValueError(INVALID_DATA_TYPE) + + if data_type == "byte" and engine not in BYTE_VECTOR_ENGINES: + raise ValueError(INVALID_BYTE_VECTOR_ENGINE) + + if data_type == "binary": + if engine != BINARY_VECTOR_ENGINE: + raise ValueError(INVALID_BINARY_ENGINE) + if space_type != "hamming": + raise ValueError(INVALID_BINARY_SPACE_TYPE) + # Default method configuration if method is None: method = { "name": "hnsw", - "space_type": "l2", + "space_type": space_type, "engine": engine, "parameters": {"ef_construction": 256, "m": 48}, } @@ -99,6 +120,7 @@ def __init__( self._index = index self._text_field = text_field self._max_chunk_bytes = max_chunk_bytes + self._data_type = data_type self._search_pipeline = search_pipeline http_auth = kwargs.get("http_auth") @@ -112,6 +134,7 @@ def __init__( embedding_field: { "type": "knn_vector", "dimension": dim, + "data_type": data_type, "method": method, }, } diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-opensearch/tests/test_opensearch_client.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-opensearch/tests/test_opensearch_client.py index f0318d3b7a..17a1544471 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-opensearch/tests/test_opensearch_client.py +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-opensearch/tests/test_opensearch_client.py @@ -20,6 +20,8 @@ VectorStoreQuery, ) +from opensearchpy import exceptions + ## # Start Opensearch locally # cd tests 
@@ -850,3 +852,333 @@ def test_efficient_filtering_used_when_enabled(os_stores: List[OpensearchVectorS embedding_field="embedding", query_embedding=[1], k=20, filters=filters ) assert patched_default_approximate_search_query.called + + +@pytest.mark.skipif(opensearch_not_available, reason="opensearch is not available") +def test_binary_vector_initialisation() -> None: + """Test binary vector initialisation with valid and invalid configurations.""" + client = OpensearchVectorClient( + endpoint="localhost:9200", + index=f"test_{uuid.uuid4().hex}", + dim=8, + data_type="binary", + engine="faiss", + space_type="hamming", + ) + assert client._data_type == "binary" + assert client._method["engine"] == "faiss" + assert client._method["space_type"] == "hamming" + client._os_client.indices.delete(index=client._index) + client._os_client.close() + client._os_async_client.close() + + with pytest.raises( + exceptions.RequestError, + match="Dimension should be multiply of 8 for binary vector data type", + ): + OpensearchVectorClient( + endpoint="localhost:9200", + index=f"test_{uuid.uuid4().hex}", + dim=2, + data_type="binary", + engine="faiss", + space_type="hamming", + ) + + with pytest.raises( + ValueError, match="Binary vectors must use 'faiss' as the engine type" + ): + OpensearchVectorClient( + endpoint="localhost:9200", + index=f"test_{uuid.uuid4().hex}", + dim=8, + data_type="binary", + engine="lucene", + space_type="hamming", + ) + + with pytest.raises( + ValueError, match="Binary vectors must use 'hamming' as the space type" + ): + OpensearchVectorClient( + endpoint="localhost:9200", + index=f"test_{uuid.uuid4().hex}", + dim=8, + data_type="binary", + engine="faiss", + space_type="l2", + ) + + client = OpensearchVectorClient( + endpoint="localhost:9200", + index=f"test_{uuid.uuid4().hex}", + dim=16, + data_type="binary", + engine="faiss", + space_type="hamming", + ) + assert client._data_type == "binary" + assert client._method["engine"] == "faiss" + assert 
client._method["space_type"] == "hamming" + client._os_client.indices.delete(index=client._index) + client._os_client.close() + client._os_async_client.close() + + custom_method = { + "name": "hnsw", + "space_type": "hamming", + "engine": "faiss", + "parameters": {"ef_construction": 128, "m": 24}, + } + client = OpensearchVectorClient( + endpoint="localhost:9200", + index=f"test_{uuid.uuid4().hex}", + dim=16, + data_type="binary", + method=custom_method, + ) + assert client._data_type == "binary" + assert client._method == custom_method + client._os_client.indices.delete(index=client._index) + client._os_client.close() + client._os_async_client.close() + + +@pytest.fixture() +def binary_vector_store() -> Generator[OpensearchVectorStore, None, None]: + """Fixture for binary vector store testing.""" + client = OpensearchVectorClient( + endpoint="localhost:9200", + index=f"test_{uuid.uuid4().hex}", + dim=16, + data_type="binary", + method={ + "name": "hnsw", + "space_type": "hamming", + "engine": "faiss", + "parameters": {"ef_construction": 256, "m": 48}, + }, + ) + + yield OpensearchVectorStore(client) + + client._os_client.indices.delete(index=client._index) + client._os_client.close() + client._os_async_client.close() + + +@pytest.mark.skipif(opensearch_not_available, reason="opensearch is not available") +def test_binary_vector_functionality( + binary_vector_store: OpensearchVectorStore, +) -> None: + """Test basic functionality with binary vectors.""" + nodes = [ + TextNode( + text="test1", + id_="test1", + embedding=[108, -116], + ), + TextNode( + text="test2", + id_="test2", + embedding=[-128, 127], + ), + ] + + assert len(binary_vector_store.add(nodes)) == len(nodes) + + query = VectorStoreQuery(query_embedding=[127, 15], similarity_top_k=1) + result = binary_vector_store.query(query) + assert result.nodes + assert result.nodes[0].get_content() == "test1" + + +@pytest.mark.asyncio() +@pytest.mark.skipif(opensearch_not_available, reason="opensearch is not 
available") +async def test_binary_vector_async_functionality( + binary_vector_store: OpensearchVectorStore, +) -> None: + """Test async functionality with binary vectors.""" + nodes = [ + TextNode( + text="test1", + id_="test1", + embedding=[108, -116], + ), + TextNode( + text="test2", + id_="test2", + embedding=[-128, 127], + ), + ] + + assert len(await binary_vector_store.async_add(nodes)) == len(nodes) + + query = VectorStoreQuery(query_embedding=[127, 15], similarity_top_k=1) + result = await binary_vector_store.aquery(query) + assert result.nodes + assert result.nodes[0].get_content() == "test1" + + +@pytest.mark.skipif(opensearch_not_available, reason="opensearch is not available") +def test_byte_vector_initialisation() -> None: + """Test byte vector initialisation with valid and invalid configurations.""" + client = OpensearchVectorClient( + endpoint="localhost:9200", + index=f"test_{uuid.uuid4().hex}", + dim=4, + data_type="byte", + engine="faiss", + space_type="l2", + ) + + assert client._data_type == "byte" + assert client._method["engine"] == "faiss" + assert client._method["space_type"] == "l2" + client._os_client.indices.delete(index=client._index) + client._os_client.close() + client._os_async_client.close() + + client = OpensearchVectorClient( + endpoint="localhost:9200", + index=f"test_{uuid.uuid4().hex}", + dim=4, + data_type="byte", + engine="lucene", + space_type="l2", + ) + assert client._data_type == "byte" + assert client._method["engine"] == "lucene" + assert client._method["space_type"] == "l2" + client._os_client.indices.delete(index=client._index) + client._os_client.close() + client._os_async_client.close() + + with pytest.raises( + ValueError, + match="Byte vectors only support 'lucene' or 'faiss' as the engine type", + ): + OpensearchVectorClient( + endpoint="localhost:9200", + index=f"test_{uuid.uuid4().hex}", + dim=4, + data_type="byte", + engine="nmslib", + space_type="l2", + ) + + custom_method = { + "name": "hnsw", + "space_type": 
"l2", + "engine": "faiss", + "parameters": {"ef_construction": 128, "m": 24}, + } + client = OpensearchVectorClient( + endpoint="localhost:9200", + index=f"test_{uuid.uuid4().hex}", + dim=4, + data_type="byte", + method=custom_method, + ) + assert client._data_type == "byte" + assert client._method == custom_method + client._os_client.indices.delete(index=client._index) + client._os_client.close() + client._os_async_client.close() + + +@pytest.fixture() +def byte_vector_store_faiss() -> Generator[OpensearchVectorStore, None, None]: + """Fixture for byte vector store testing with faiss engine.""" + client = OpensearchVectorClient( + endpoint="localhost:9200", + index=f"test_{uuid.uuid4().hex}", + dim=4, + data_type="byte", + method={ + "name": "hnsw", + "space_type": "l2", + "engine": "faiss", + "parameters": {"ef_construction": 256, "m": 48}, + }, + ) + + yield OpensearchVectorStore(client) + + client._os_client.indices.delete(index=client._index) + client._os_client.close() + client._os_async_client.close() + + +@pytest.fixture(params=["faiss", "lucene"]) +def byte_vector_store(request) -> Generator[OpensearchVectorStore, None, None]: + """Fixture for byte vector store testing that alternates between engines.""" + client = OpensearchVectorClient( + endpoint="localhost:9200", + index=f"test_{uuid.uuid4().hex}", + dim=4, + data_type="byte", + method={ + "name": "hnsw", + "space_type": "l2", + "engine": request.param, + "parameters": {"ef_construction": 256, "m": 48}, + }, + ) + + yield OpensearchVectorStore(client) + + client._os_client.indices.delete(index=client._index) + client._os_client.close() + client._os_async_client.close() + + +@pytest.mark.skipif(opensearch_not_available, reason="opensearch is not available") +def test_byte_vector_functionality(byte_vector_store: OpensearchVectorStore) -> None: + """Test basic functionality with byte vectors using both engines.""" + nodes = [ + TextNode( + text="test1", + id_="test1", + embedding=[108, -116, 75, -90], + 
), + TextNode( + text="test2", + id_="test2", + embedding=[-128, 127, 0, 64], + ), + ] + + assert len(byte_vector_store.add(nodes)) == len(nodes) + + query = VectorStoreQuery(query_embedding=[108, -116, 75, -90], similarity_top_k=1) + result = byte_vector_store.query(query) + assert result.nodes + assert result.nodes[0].get_content() == "test1" + + +@pytest.mark.asyncio() +@pytest.mark.skipif(opensearch_not_available, reason="opensearch is not available") +async def test_byte_vector_async_functionality( + byte_vector_store: OpensearchVectorStore, +) -> None: + """Test async functionality with byte vectors using both engines.""" + nodes = [ + TextNode( + text="test1", + id_="test1", + embedding=[108, -116, 75, -90], + ), + TextNode( + text="test2", + id_="test2", + embedding=[-128, 127, 0, 64], + ), + ] + + assert len(await byte_vector_store.async_add(nodes)) == len(nodes) + + query = VectorStoreQuery(query_embedding=[108, -116, 75, -90], similarity_top_k=1) + result = await byte_vector_store.aquery(query) + assert result.nodes + assert result.nodes[0].get_content() == "test1" From f61f8a640fbeacf2773b9814b048ed8f6f96c976 Mon Sep 17 00:00:00 2001 From: Christopher McLaughlin Date: Fri, 10 Jan 2025 10:23:56 +0000 Subject: [PATCH 2/2] version-bump --- .../llama-index-vector-stores-opensearch/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-opensearch/pyproject.toml b/llama-index-integrations/vector_stores/llama-index-vector-stores-opensearch/pyproject.toml index 6231794f8b..4710eab353 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-opensearch/pyproject.toml +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-opensearch/pyproject.toml @@ -27,7 +27,7 @@ exclude = ["**/BUILD"] license = "MIT" name = "llama-index-vector-stores-opensearch" readme = "README.md" -version = "0.5.2" +version = "0.6.0" [tool.poetry.dependencies] 
python = ">=3.9,<4.0"