from typing import Any, Callable, Dict, Iterable, List, Optional, Union, cast

# Accepted (lower-cased) spellings of each distance metric, mapped to the TQL
# function that computes it.
_METRIC_FUNCTIONS = {
    "cos": "cosine_similarity",
    "cosine": "cosine_similarity",
    "cosine_similarity": "cosine_similarity",
    "l2": "l2_norm",
    "l2_norm": "l2_norm",
}


def _metric_to_function(metric: Optional[str]) -> str:
    """Return the TQL scoring function name for ``metric``.

    Args:
        metric: Metric name, matched case-insensitively. ``None`` selects
            cosine similarity.

    Raises:
        ValueError: If ``metric`` is not a recognised name.
    """
    if metric is None:
        return "cosine_similarity"
    try:
        return _METRIC_FUNCTIONS[str(metric).lower()]
    except KeyError:
        raise ValueError(
            f"Unknown metric: {metric}, should be one of "
            "['cos', 'cosine', 'cosine_similarity', 'l2', 'l2_norm']"
        ) from None


def _build_search_query(
    embedding: Iterable[float],
    k: int,
    distance_metric: Optional[str],
    return_tensors: Optional[List[str]],
) -> str:
    """Build the TQL query that ranks rows against ``embedding``.

    Args:
        embedding: Query vector; each component is rendered with ``str``.
        k: Maximum number of rows to return.
        distance_metric: Metric name understood by ``_metric_to_function``.
        return_tensors: Columns to select. ``None`` or an empty list selects
            every column.

    Returns:
        A query selecting the requested columns plus a ``score`` column,
        ordered best match first and limited to ``k`` rows.
    """
    score_fn = _metric_to_function(distance_metric)
    # A similarity is best when largest, a distance when smallest.
    direction = "DESC" if score_fn == "cosine_similarity" else "ASC"
    vector = ", ".join(str(component) for component in embedding)
    score_expr = f"{score_fn}(embedding, ARRAY[{vector}])"
    columns = ", ".join(return_tensors) if return_tensors else "*"
    return (
        f"SELECT {columns}, {score_expr} as score "
        f"ORDER BY {score_expr} {direction} LIMIT {int(k)}"
    )


try:
    import deeplake

    if deeplake.__version__.startswith("3."):
        from deeplake.core.vectorstore import VectorStore
    else:

        class VectorStore:
            """Stand-in for ``deeplake.core.vectorstore.VectorStore``.

            Used when the installed deeplake release is not a 3.x version.
            Wraps a dataset with ``text``, ``metadata``, ``embedding`` and
            ``id`` columns.
            """

            def __init__(
                self,
                path: str,
                read_only: bool = False,
                token: Optional[str] = None,
                exec_option: Optional[str] = None,
                verbose: bool = False,
                runtime: Optional[Dict] = None,
                index_params: Optional[Dict[str, Union[int, str]]] = None,
                **kwargs: Any,
            ):
                """Open the dataset at ``path``, creating it when missing.

                Args:
                    path: Dataset location.
                    read_only: Open the dataset read-only; it is never
                        created in this mode.
                    token: Token passed to deeplake when opening/creating.
                    exec_option: Stored on the instance as ``exec_options``.
                    verbose: Stored on the instance.
                    runtime: Stored on the instance.
                    index_params: Stored on the instance.
                    **kwargs: Stored on the instance. An optional
                        ``embedding_function`` entry (an object exposing
                        ``embed_documents``) is used to embed string
                        queries and to size a newly created dataset.

                Raises:
                    ImportError: If deeplake is flagged as not installed.
                    ValueError: If the dataset has to be created and no
                        ``embedding_function`` was supplied.
                """
                if _DEEPLAKE_INSTALLED is False:
                    raise ImportError(
                        "Could not import deeplake python package. "
                        "Please install it with `pip install deeplake[enterprise]`."
                    )
                self.path = path
                # `search` and `__create_dataset` read this attribute, so it
                # must always exist even when the caller does not provide it.
                self.embedding_function = kwargs.get("embedding_function")
                self.read_only = read_only
                self.token = token
                self.exec_options = exec_option
                self.verbose = verbose
                self.runtime = runtime
                self.index_params = index_params
                self.kwargs = kwargs
                if read_only:
                    self.ds = deeplake.open_read_only(self.path, self.token)
                else:
                    try:
                        self.ds = deeplake.open(self.path, self.token)
                    except deeplake.LogNotexistsError:
                        self.__create_dataset()

            def tensors(self) -> List[str]:
                """Return the names of the dataset's columns."""
                return [column.name for column in self.ds.schema.columns]

            def add(
                self,
                text: List[str],
                metadata: Optional[List[dict]],
                embedding_data: Iterable[str],
                embedding_tensor: Optional[str],
                embedding_function: Optional[Callable],
                return_ids: bool,
                **tensors: Any,
            ) -> Optional[List[str]]:
                """Append rows to the dataset and commit.

                Args:
                    text: Texts to store, one per row.
                    metadata: Metadata dicts, one per row.
                    embedding_data: Values stored in the embedding column;
                        replaced by ``embedding_function(text)`` when a
                        function is given.
                    embedding_tensor: Name of the embedding column;
                        ``None`` means ``"embedding"``.
                    embedding_function: Optional callable applied to
                        ``text`` to produce the embeddings.
                    return_ids: Whether to return the row ids.
                    **tensors: Only ``id`` is read; when absent, ids are
                        generated with ``uuid.uuid1``.

                Returns:
                    The row ids when ``return_ids`` is true, else ``None``.
                """
                import uuid

                if embedding_function is not None:
                    embedding_data = embedding_function(text)
                if embedding_tensor is None:
                    embedding_tensor = "embedding"
                if "id" in tensors:
                    ids = tensors["id"]
                else:
                    ids = [str(uuid.uuid1()) for _ in text]
                self.ds.append(
                    {
                        "text": text,
                        "metadata": metadata,
                        "id": ids,
                        embedding_tensor: embedding_data,
                    }
                )
                self.ds.commit()
                return ids if return_ids else None

            def search_tql(
                self, query: str, exec_options: Optional[str]
            ) -> Dict[str, Any]:
                """Run a raw TQL ``query`` and return its columns.

                ``exec_options`` is accepted for signature compatibility and
                is not used.
                """
                return self.__view_to_docs(self.ds.query(query))

            def search(
                self,
                embedding: Union[str, List[float]],
                k: int,
                distance_metric: str,
                filter: Optional[Dict[str, Any]],
                exec_option: Optional[str],
                return_tensors: List[str],
                deep_memory: Optional[bool],
                query: Optional[str] = None,
            ) -> Dict[str, Any]:
                """Return the ``k`` rows closest to ``embedding``.

                When ``query`` is given it is executed as raw TQL and every
                other argument except ``exec_option`` is ignored. ``filter``
                and ``deep_memory`` are accepted but not used.

                Args:
                    embedding: Query vector, or a string to embed with the
                        instance's ``embedding_function``.
                    k: Maximum number of rows to return.
                    distance_metric: One of ``cos``, ``cosine``,
                        ``cosine_similarity``, ``l2``, ``l2_norm``.
                    return_tensors: Columns to return; ``None`` or empty
                        returns all of them.
                    query: Optional raw TQL query.

                Returns:
                    A mapping from column name to the column's values,
                    including a ``score`` column for vector searches.

                Raises:
                    ValueError: If neither ``embedding`` nor ``query`` is
                        given, if a string ``embedding`` cannot be embedded,
                        or if ``distance_metric`` is unknown.
                """
                if query is None and embedding is None:
                    raise ValueError(
                        "Neither `embedding` nor `query` was specified."
                        " Please specify one of them."
                    )
                if query is not None:
                    return self.search_tql(query, exec_option)

                if isinstance(embedding, str):
                    if self.embedding_function is None:
                        raise ValueError(
                            "embedding_function is required when embedding is a string"
                        )
                    embedding = self.embedding_function.embed_documents(
                        [embedding]
                    )[0]

                tql = _build_search_query(
                    embedding, k, distance_metric, return_tensors
                )
                return self.__view_to_docs(self.ds.query(tql))

            def delete(
                self, ids: List[str], filter: Dict[str, Any], delete_all: bool
            ) -> None:
                """Not supported by this implementation."""
                raise NotImplementedError

            def dataset(self) -> Any:
                """Return the underlying deeplake dataset object."""
                return self.ds

            def __view_to_docs(self, view: Any) -> Dict[str, Any]:
                """Convert a query result view into ``{column: values}``."""
                docs = {}
                for column in view.schema.columns:
                    name = column.name
                    values = view[name][:]
                    if str(column.dtype) == "dict":
                        docs[name] = [item.to_dict() for item in values]
                    else:
                        try:
                            docs[name] = values.tolist()
                        except AttributeError:
                            # Values without `tolist` are returned as-is.
                            docs[name] = values
                return docs

            def __create_dataset(self) -> None:
                """Create the dataset at ``self.path`` with the store's columns.

                The embedding width is measured by embedding a probe string.

                Raises:
                    ValueError: If no ``embedding_function`` is available.
                """
                if self.embedding_function is None:
                    raise ValueError(
                        "embedding_function is required to create a new dataset"
                    )
                emb_size = len(self.embedding_function.embed_documents(["test"])[0])
                self.ds = deeplake.create(self.path, self.token)
                self.ds.add_column("text", deeplake.types.Text("inverted"))
                self.ds.add_column("metadata", deeplake.types.Dict())
                self.ds.add_column(
                    "embedding", deeplake.types.Embedding(size=emb_size)
                )
                self.ds.add_column("id", deeplake.types.Text())
                self.ds.commit()

    _DEEPLAKE_INSTALLED = True
except ImportError:
    _DEEPLAKE_INSTALLED = False