|
1 | | -from typing import Tuple, List |
| 1 | +from typing import Tuple, List, Literal |
2 | 2 | from pathlib import Path |
| 3 | +from enum import Enum |
| 4 | +from typing_extensions import Annotated |
3 | 5 |
|
4 | 6 | from langchain_core.tools import tool |
5 | 7 | from langchain.vectorstores.base import VectorStore |
|
11 | 13 | DATA = BASE / "data" |
12 | 14 |
|
13 | 15 |
|
class SourceTag(str, Enum):
    """Closed set of source-type tags used to narrow a retrieval search.

    Members are ``str`` subclasses, so they compare equal to (and hash like)
    their raw tag values and can be used wherever the plain string tag is
    expected, e.g. as the ``source_type`` value of a metadata filter.
    """

    # Chunks taken from text documents (pdf, word, txt, etc).
    TEXT = "text_chunk"
    # Text extracted through OCR, per image.
    IMAGE = "image_text"
    # Summary cards of excel sheets or csv files.
    TABLE = "table_summary"

    def __str__(self) -> str:
        """Return the raw tag value (e.g. ``"text_chunk"``).

        Without this override ``str(member)`` yields ``"SourceTag.TEXT"``,
        and ``format()`` / f-string output for ``str``-mixin enums differs
        between Python versions. Rendering the value keeps every
        string-formatting path consistent with the tag that is actually
        stored and filtered on.
        """
        return str(self.value)
| 20 | + |
| 21 | + |
14 | 22 | def initialize_retrieve_tool(vector_store: VectorStore): |
15 | 23 | @tool( |
16 | | - description="Retrieve information related to a query", |
| 24 | + description=( |
| 25 | + """ |
| 26 | + Semantic search over your docs. Valid tags are |
| 27 | + "text_chunk", "image_text", and "table_summary". |
| 28 | + """ |
| 29 | + ), |
17 | 30 | response_format="content_and_artifact", |
18 | 31 | ) |
19 | | - def retrieve(query: str) -> Tuple[str, List[Document]]: |
20 | | - retrieved_docs = vector_store.similarity_search(query, k=3) |
| 32 | + def retrieve( |
| 33 | + query: str, |
| 34 | + tag: Annotated[ |
| 35 | + Literal["text_chunk", "image_text", "table_summary"], |
| 36 | + """ |
| 37 | + Select between |
| 38 | + "text_chunk" (chunks over pdf, word, txt, etc), |
| 39 | + "image_text" (texts extracted through OCR per image), or |
| 40 | + "table_summary" (summary cards of excel sheets or csv files) |
| 41 | + """, |
| 42 | + ], |
| 43 | + ) -> Tuple[str, List[Document]]: |
| 44 | + """ |
| 45 | + Args: |
| 46 | + query: keywords or natural-language question. |
| 47 | + tag: which subset to search ("text_chunk", "image_text", "table_summary"). |
| 48 | + Returns: |
| 49 | + (summary_string, list_of_Documents) |
| 50 | + """ |
| 51 | + retrieved_docs = vector_store.similarity_search( |
| 52 | + query, |
| 53 | + filter={"source_type": tag}, |
| 54 | + k=2, |
| 55 | + ) |
21 | 56 | serialized = "\n\n".join( |
22 | 57 | (f"Source: {doc.metadata}\nContent: {doc.page_content}") |
23 | 58 | for doc in retrieved_docs |
|
0 commit comments