|
| 1 | +from chromadb.api.types import IncludeEnum |
| 2 | +from models import embed_func |
| 3 | +from logger import setup_logger |
| 4 | + |
| 5 | +from alkemio_virtual_contributor_engine.chromadb_client import chromadb_client |
| 6 | + |
| 7 | + |
# Module-level logger named after this module, built by the project's
# ``setup_logger`` helper.
logger = setup_logger(__name__)
| 9 | + |
| 10 | + |
def combine_documents(docs, document_separator="\n\n"):
    """Join retrieved documents into a single source-tagged context string.

    Each document in the first result list of ``docs["documents"]`` is
    prefixed with ``[source:<index>]``, where ``<index>`` is its position in
    that list, and the tagged chunks are joined with *document_separator*.

    Args:
        docs: Mapping with a ``"documents"`` entry holding a list of result
            lists; only the first result list is used.
        document_separator: String placed between consecutive chunks.

    Returns:
        The combined text, or an empty string when there are no documents.

    Raises:
        KeyError: If *docs* has no ``"documents"`` entry.
    """
    result_lists = docs["documents"]
    # An empty or ``None`` "documents" entry means nothing was retrieved;
    # produce an empty context instead of failing on ``[0]``.
    first_result = result_lists[0] if result_lists else []
    return document_separator.join(
        f"[source:{index}] {document}"
        for index, document in enumerate(first_result or [])
    )
| 17 | + |
| 18 | + |
def get_documents(message: str, *, collections=None, n_results: int = 3):
    """Query several Chroma collections for *message* and merge the hits.

    Every collection is fetched from ``chromadb_client`` with ``embed_func``
    as its embedding function and queried with *message* as the only query
    text. The documents, metadatas and distances of each non-empty response
    are appended to a single merged result, in the order the collections
    are listed; the merged hits are not re-sorted by distance.

    Args:
        message: Text used as the query.
        collections: Iterable of collection names to search. ``None`` (the
            default) searches the three built-in knowledge collections.
        n_results: Number of hits requested from each collection.

    Returns:
        A dict with ``"documents"``, ``"metadatas"`` and ``"distances"``
        keys, each holding a list that contains one merged list of hits.
    """
    if collections is None:
        collections = (
            "alkem.io-knowledge",
            "welcome.alkem.io-knowledge",
            "www.alkemio.org-knowledge",
        )
    result = {"documents": [[]], "metadatas": [[]], "distances": [[]]}

    for collection_name in collections:
        collection = chromadb_client.get_collection(
            collection_name, embedding_function=embed_func
        )
        tmp_result = collection.query(
            query_texts=[message],
            include=[
                IncludeEnum.documents,
                IncludeEnum.metadatas,
                IncludeEnum.distances,
            ],
            n_results=n_results,
        )
        # Skip responses where any of the three requested fields is missing
        # or empty, so the merged lists always stay aligned.
        if not (
            tmp_result
            and tmp_result["documents"]
            and tmp_result["distances"]
            and tmp_result["metadatas"]
        ):
            continue
        # Only one query text is sent, so only the first result list matters.
        for key in ("distances", "documents", "metadatas"):
            result[key][0].extend(tmp_result[key][0])
    return result
| 51 | + |
| 52 | + |
def create_context(message):
    """Retrieve the documents for *message* and build the combined context.

    Args:
        message: Text passed to ``get_documents``.

    Returns:
        A ``(documents, combined_context)`` tuple: the mapping returned by
        ``get_documents`` and the string ``combine_documents`` builds from it.
    """
    documents = get_documents(message)
    logger.info("Context retrieved.")
    logger.debug(f"Context is {documents}")
    combined_context = combine_documents(documents)
    return documents, combined_context
0 commit comments