llm-with-rag/document_processor.py at main · infinitelambda/llm-with-rag · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from chainlit.types import AskFileResponse
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings
import chainlit as cl

#text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=0)
embeddings = OpenAIEmbeddings()
text_splitter = SemanticChunker(embeddings)


def process_file(file: AskFileResponse):
    """
    Processes the uploaded file by splitting its content into smaller chunks.

    The function determines the file type (text or PDF), loads the file content,
    and then splits the content into smaller chunks using a text splitter.

    Parameters:
        file (AskFileResponse): The file uploaded by the user.

    Returns:
        List[Document]: A list of processed document chunks with updated metadata.
    """
    import tempfile

    if file.type == "text/plain":
        loader = TextLoader
    elif file.type == "application/pdf":
        loader = PyPDFLoader

    with tempfile.NamedTemporaryFile() as tempfile:
        tempfile.write(file.content)
        loader = loader(tempfile.name)
        documents = loader.load()
        documents_split = text_splitter.split_documents(documents)

        for i, doc in enumerate(documents_split):
            doc.metadata["source"] = f"source_{i}"

        return documents_split


def get_docsearch(file: AskFileResponse):
    """
    Creates a document search index from the processed file.

    This function processes the uploaded file, stores the processed document chunks
    in the user session, and creates a Chroma search index using the document chunks
    and embeddings.

    Parameters:
        file (AskFileResponse): The file uploaded by the user.

    Returns:
        Chroma: A Chroma search index created from the processed document chunks.
    """
    docs = process_file(file)

    cl.user_session.set("docs", docs)

    return Chroma.from_documents(docs, embeddings)


welcome_message = """Welcome to DocQuery! To get started:
1. Upload a PDF or text file
2. Ask a question about the file
"""


async def get_file_from_user():
    """
    Asynchronously prompts the user to upload a file and waits for the file upload.

    The function sends a message asking the user to upload a PDF or text file,
    and waits until the user uploads a file or the request times out.

    Returns:
        AskFileResponse: The file uploaded by the user.
    """
    await cl.Message(content="Hello! Please upload a pdf in order to ask questions about it.").send()

    files = None
    while files is None:
        files = await cl.AskFileMessage(
            content=welcome_message,
            accept=["text/plain", "application/pdf"],
            max_size_mb=20,
            timeout=180,
        ).send()

    return files[0]