# qa_chain_retrieval/utils.py (repository: sadiahsaeed/qa_chain_retrieval, branch: main)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from langchain_openai import OpenAIEmbeddings
import os

from qdrant_client import QdrantClient
from langchain_qdrant import QdrantVectorStore , Qdrant
from langchain_openai import OpenAIEmbeddings
from qdrant_client import QdrantClient, models

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from operator import itemgetter
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.runnables import  RunnablePassthrough, RunnableParallel
from langchain.retrievers.document_compressors import LLMChainFilter
from langchain.retrievers import ContextualCompressionRetriever

from langchain.document_loaders import TextLoader


# Maximum length of each chunk; the splitters in this module measure length
# with len(), so this is a character count.
chunk_size: int = 500
# Overlap between consecutive chunks, in the same unit as chunk_size.
chunk_overlap: int = 50


def load_split_text_file(file):
    """Load a plain-text file and split it into overlapping chunks.

    Args:
        file: Path of the text file; passed unchanged to ``TextLoader``.

    Returns:
        The list of chunk documents produced by
        ``RecursiveCharacterTextSplitter`` configured with the module-level
        ``chunk_size`` and ``chunk_overlap``.
    """
    documents = TextLoader(file).load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )

    # split_documents() splits each document's page_content and copies the
    # loader-provided metadata onto every chunk. Feeding bare strings to
    # create_documents() would discard that metadata, leaving retrieved
    # chunks untraceable to their origin.
    return text_splitter.split_documents(documents)

def load_split_pdf_file(file):
    """Load a PDF with ``PyMuPDFLoader`` and split it into overlapping chunks.

    Args:
        file: Path of the PDF file; passed unchanged to ``PyMuPDFLoader``.

    Returns:
        The list of chunk documents produced by
        ``RecursiveCharacterTextSplitter`` configured with the module-level
        ``chunk_size`` and ``chunk_overlap``.
    """
    pages = PyMuPDFLoader(file).load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )

    # split_documents() keeps the metadata the loader attached to each page
    # on the chunks cut from it. Splitting bare page_content strings with
    # create_documents() would drop that metadata.
    return text_splitter.split_documents(pages)

def load_split_docx_file(file):
    """Load a Word document and split it into overlapping chunks.

    Args:
        file: Path of the .docx file. It is converted with ``str()`` before
            being handed to ``UnstructuredWordDocumentLoader``, so path-like
            objects are accepted.

    Returns:
        The list of chunk documents produced by
        ``RecursiveCharacterTextSplitter`` configured with the module-level
        ``chunk_size`` and ``chunk_overlap``.
    """
    documents = UnstructuredWordDocumentLoader(str(file)).load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )

    # split_documents() propagates the loader-provided metadata to every
    # chunk; create_documents() on bare strings would discard it.
    return text_splitter.split_documents(documents)

class QdrantInsertRetrievalAll:
    """Helper bound to one Qdrant endpoint for writing and reading collections.

    The endpoint URL and API key are stored on the instance and reused by
    both ``insertion`` and ``retrieval``.
    """

    def __init__(self, api_key, url):
        """Remember the Qdrant credentials; no connection is opened here."""
        self.api_key = api_key
        self.url = url

    def insertion(self, text, embeddings, collection_name):
        """Embed documents and store them in a Qdrant collection.

        Args:
            text: Documents to index (handed to
                ``QdrantVectorStore.from_documents``).
            embeddings: Embedding model used to vectorise the documents.
            collection_name: Name of the target collection.

        Returns:
            The ``QdrantVectorStore`` created by ``from_documents``.
        """
        # NOTE(review): force_recreate=True is passed on every call, so an
        # existing collection with this name is presumably replaced rather
        # than appended to -- confirm against the langchain_qdrant docs.
        store_options = {
            "url": self.url,
            "prefer_grpc": False,
            "api_key": self.api_key,
            "collection_name": collection_name,
            "force_recreate": True,
        }
        vector_store = QdrantVectorStore.from_documents(
            text, embeddings, **store_options
        )
        print("insertion successfull")
        return vector_store

    def retrieval(self, collection_name, embeddings):
        """Open an existing collection as a vector store.

        Args:
            collection_name: Name of the collection to read from.
            embeddings: Embedding model the store should use for queries.

        Returns:
            A ``Qdrant`` vector store backed by a new ``QdrantClient`` built
            from the stored URL and API key. Note this is the ``Qdrant``
            class, whereas ``insertion`` returns a ``QdrantVectorStore``.
        """
        client = QdrantClient(url=self.url, api_key=self.api_key)
        return Qdrant(
            client,
            collection_name=collection_name,
            embeddings=embeddings,
        )


def QA_Chain_Retrieval(query, qdrant_vectordb, *, num_chunks=10, model_name="gpt-4o-mini"):
    """Answer a question using chunks retrieved from a vector store.

    The question is used to fetch the most similar chunks from
    ``qdrant_vectordb``; their text is joined into a context block, inserted
    into a fixed prompt, and sent to an OpenAI chat model.

    Args:
        query: The user's question.
        qdrant_vectordb: Vector store exposing ``as_retriever``.
        num_chunks: How many chunks to retrieve as context (keyword-only;
            defaults to the previously hard-coded 10).
        model_name: Chat model passed to ``ChatOpenAI`` (keyword-only;
            defaults to the previously hard-coded ``"gpt-4o-mini"``).

    Returns:
        On success, whatever the chain's ``invoke`` returns (the chat model's
        response object). On any failure, a string beginning with
        ``"Error executing retrieval chain: "`` followed by the exception
        text -- exceptions are deliberately not propagated to the caller.
    """
    def format_docs(docs):
        # Blank-line-separated concatenation of the retrieved chunk texts.
        return "\n\n".join(doc.page_content for doc in docs)

    try:
        # Prompt wording and whitespace are part of what the model receives;
        # keep them exactly as they are.
        prompt_str = """
        you are expert chatbot assistant. you cannot generate response other than provided context. if the response is not in the provided context then print("For this query there is no information in the uploaded documents.")
        {context}

        Question: {question}
        """
        _prompt = ChatPromptTemplate.from_template(prompt_str)

        retriever = qdrant_vectordb.as_retriever(
            search_type="similarity",
            search_kwargs={"k": num_chunks},
        )

        chat_llm = ChatOpenAI(model_name=model_name)

        # The same "question" value feeds both the prompt and the retriever.
        query_fetcher = itemgetter("question")
        setup = {
            "question": query_fetcher,
            "context": query_fetcher | retriever | format_docs,
        }

        _chain = setup | _prompt | chat_llm
        return _chain.invoke({"question": query})

    except Exception as e:
        # Best-effort boundary: callers receive an error string, not a raise.
        return f"Error executing retrieval chain: {e}"