diff --git a/Homepage.py b/Homepage.py
new file mode 100644
index 00000000..54602a25
--- /dev/null
+++ b/Homepage.py
@@ -0,0 +1,60 @@
+import streamlit as st
+from PIL import Image
+
+img = Image.open("images/affine.jpg")
+
+page_config = {"page_title": "Contract_comparison_tool.io", "page_icon": img, "layout": "wide"}
+
+st.set_page_config(**page_config)
+
+# Lay out the header as four columns: two narrow columns (spacer and logo),
+# one wide column for the title, and a narrow right-hand spacer.
+
+# CSS that hides Streamlit's default menu and footer
+hide_streamlit_style = """
+    <style>
+    #MainMenu {visibility: hidden;}
+    footer {visibility: hidden;}
+    </style>
+    """
+st.markdown(hide_streamlit_style, unsafe_allow_html=True)
+
+col1, col2, col3, col4 = st.columns([2.5, 2.5, 8.5, 0.5])
+with col1:
+    st.write(' ')
+with col2:
+    # img = Image.open("images/affine.jpg")
+    st.image(img, width=130)
+
+with col3:
+    st.markdown("""
+<div>
+Contract Comparator
+</div>
+""", unsafe_allow_html=True)
+with col4:
+    st.write(' ')
+
+st.write("\n")
+st.write("\n")
+st.write("\n")
+st.write("\n")
+st.write("\n")
+st.write("\n")
+st.write("**This tool compares two versions of the same contract and highlights all the major changes made between the two documents.**")
+
+# key=col1._text_input()
+# Left column: Upload PDF text
+# st.header("Dashboard")
+
+# Extract the text from the uploaded PDF
diff --git a/README.md b/README.md
index 5178ee42..c74d4c35 100644
--- a/README.md
+++ b/README.md
@@ -1,32 +1,85 @@
-# oneAPI-GenAI-Hackathon-2023 - Hack2Skill
+#### Team Name - Team Affine
+#### Problem Statement - Generative AI Large Language Models Fine Tuned For Legal Practice Platform
+#### Team Leader Email - jayanth.ajay@affine.ai
 
-Welcome to the official repository for the oneAPI-GenAI-Hackathon-2023 organized by Hack2Skill!
-
-## Getting Started
-
-To get started with the oneAPI-GenAI-Hackathon-2023 repository, follow these steps:
-
-### Submission Instruction:
-  1. Fork this repository
-  2. Create a folder with your Team Name
-  3. Upload all the code and necessary files in the created folder
-  4. Upload a **README.md** file in your folder with the below mentioned informations.
-  5. Generate a Pull Request with your Team Name. (Example: submission-XYZ_team)
-
-### README.md must consist of the following information:
+### A Brief of the Prototype:
 
-#### Team Name -
-#### Problem Statement -
-#### Team Leader Email -
+  ![Image](https://github.com/bhaskarturkar/oneAPI-GenAI-Hackathon-2023/blob/main/process-flow-diagram.JPG)
 
-### A Brief of the Prototype:
-  This section must include UML Diagrams and prototype description
 ### Tech Stack:
-   List Down all technologies used to Build the prototype
+   Technologies used (oneAPI AI Analytics libraries included):<br>
+1. Python<br>
+2. Hugging Face<br>
+3. ChromaDB<br>
+4. LangChain<br>
+5. Open-docx<br>
+6. PyPDF<br>
+7. PyTorch<br>
+8. Intel oneDNN<br>
+9. Intel oneAPI Base Toolkit<br>
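Taken together, the retrieval pieces in the list above compose roughly as follows — an illustrative sketch against the versions pinned in requirements.txt, not code from this patch:

```python
# Minimal sketch: embed text chunks with a Hugging Face model, index them in
# Chroma, and retrieve the closest chunk for a query. Names and strings here
# are illustrative only.
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
store = Chroma.from_texts(
    ["Clause 1: the term is 12 months.", "Clause 2: fees are fixed."],
    embedding=embeddings,
)
print(store.similarity_search("How long is the term?", k=1)[0].page_content)
```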
+
 ### Step-by-Step Code Execution Instructions:
-  This Section must contain a set of instructions required to clone and run the prototype so that it can be tested and deeply analyzed
-
+
+#### Getting Started
+These instructions will guide you through setting up your environment and running the project.
+
+#### Prerequisites
+1. An IDC (Intel Developer Cloud) compute instance<br>
+2. SSH access to the instance<br>
+3. Miniconda (installation instructions included below)<br>
+#### Installation and Setup
+
+1. SSH into the IDC Compute Instance
+
+To start, SSH into your IDC compute instance. Replace `<instance-ip>` with your actual instance IP address.
+```
+ssh username@<instance-ip>
+```
+2. Install Miniconda
+
+Once logged in, install Miniconda for simplified Python environment management:
+```
+wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
+bash Miniconda3-latest-Linux-x86_64.sh
+```
+Follow the on-screen instructions to complete the installation.
+
+3. Create and Activate a Conda Environment
+
+Create and activate a new Conda environment named myenv with Python 3.10.6:
+```
+conda create -n myenv python=3.10.6
+conda activate myenv
+```
+
+4. Install Required Modules
+
+Install the modules listed in requirements.txt (make sure the file is present in your current directory):
+```
+pip install -r requirements.txt
+```
+
+5. Access the Streamlit UI via Port Forwarding
+
+Start the Streamlit app on the instance:
+```
+streamlit run Homepage.py
+```
+Then, on your local machine, open an SSH tunnel to forward the UI port:
+```
+ssh -L 8501:localhost:8501 username@<instance-ip>
+```
+You can now access the Streamlit UI by navigating to localhost:8501 in your web browser.
+
+#### Working Demo
+[Working demo of contract comparator](https://vimeo.com/891854466)
+
+#### Contributing
+Guidelines for contributing to this repository, if applicable.
+
+Note: Replace placeholders (like `<instance-ip>`) with actual values relevant to your project.
+
 ### Future Scope:
-   Write about the scalability and futuristic aspects of the prototype developed
+   Fine-tuning a larger model on a properly curated, domain-relevant dataset may improve the results.
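As a quick optional check after step 4, the Intel-optimized stack can be verified from Python — a minimal sketch, not part of the patch:

```python
# Optional environment check: confirms PyTorch and Intel Extension for
# PyTorch import cleanly and reports whether an XPU device is visible.
import torch
import intel_extension_for_pytorch as ipex

print("torch:", torch.__version__)
print("ipex:", ipex.__version__)
print("XPU available:", hasattr(torch, "xpu") and torch.xpu.is_available())
```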
diff --git a/data/Contracts/MSA v1.pdf b/data/Contracts/MSA v1.pdf
new file mode 100644
index 00000000..08f6720f
Binary files /dev/null and b/data/Contracts/MSA v1.pdf differ
diff --git a/data/Contracts/MSA v2_1.pdf b/data/Contracts/MSA v2_1.pdf
new file mode 100644
index 00000000..ef000f75
Binary files /dev/null and b/data/Contracts/MSA v2_1.pdf differ
diff --git a/images/affine.jpg b/images/affine.jpg
new file mode 100644
index 00000000..77406764
Binary files /dev/null and b/images/affine.jpg differ
diff --git a/pages/compare_doc.py b/pages/compare_doc.py
new file mode 100644
index 00000000..732ee8c9
--- /dev/null
+++ b/pages/compare_doc.py
@@ -0,0 +1,401 @@
+# Import necessary libraries
+import sys
+import site
+import os
+import logging
+import random
+import re
+import time
+import shutil
+import base64
+import pathlib
+import warnings
+from pathlib import Path
+from difflib import SequenceMatcher
+
+import torch
+import intel_extension_for_pytorch as ipex
+
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline  # , BitsAndBytesConfig
+from transformers import LlamaTokenizer, LlamaForCausalLM
+
+import streamlit as st
+from PIL import Image
+from PyPDF2 import PdfReader
+from prettytable import PrettyTable as pt
+from dotenv import load_dotenv
+
+from langchain.llms.huggingface_pipeline import HuggingFacePipeline
+from langchain.callbacks.manager import CallbackManager
+from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
+from langchain.chains import LLMChain, RetrievalQA
+from langchain.chains.question_answering import load_qa_chain
+from langchain.indexes import VectorstoreIndexCreator
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.document_loaders import PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores import Chroma
+from langchain.prompts import PromptTemplate
+from langchain.globals import set_verbose, set_debug
+
+import src.entity as entity
+from src.entity import QAModel
+
+set_verbose(True)
+set_debug(True)
+
+# Suppress warnings for cleaner output
+warnings.filterwarnings("ignore")
+# os.environ['TRANSFORMERS_CACHE'] = './my_llama_model'
+os.environ["SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS"] = "1"
+os.environ["ENABLE_SDP_FUSION"] = "1"
+
+embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+
+# Get the running interpreter's version as a string, e.g. "python3.10"
+def get_python_version():
+    return "python" + ".".join(map(str, sys.version_info[:2]))
+
+# Put the user's local bin and site-packages directories on the import path
+def set_local_bin_path():
+    local_bin = str(Path.home() / ".local" / "bin")
+    local_site_packages = str(
+        Path.home() / ".local" / "lib" / get_python_version() / "site-packages"
+    )
+    sys.path.append(local_bin)
+    sys.path.insert(0, site.getusersitepackages())
+    sys.path.insert(0, sys.path.pop(sys.path.index(local_site_packages)))
+
+set_local_bin_path()
+
+# Seed the XPU RNGs for reproducibility when an XPU is available
+if torch.xpu.is_available():
+    seed = 88
+    random.seed(seed)
+    torch.xpu.manual_seed(seed)
+    torch.xpu.manual_seed_all(seed)
+
+def select_device(preferred_device=None):
+    """
+    Selects the best available XPU device, or the preferred device if specified.
+
+    Args:
+        preferred_device (str, optional): Preferred device string (e.g., "cpu",
+            "xpu", "xpu:0", "xpu:1"). If None, a random available XPU device is
+            selected, falling back to CPU if no XPU devices are available.
+
+    Returns:
+        torch.device: The selected device object.
+    """
+    try:
+        if preferred_device and preferred_device.startswith("cpu"):
+            print("Using CPU.")
+            return torch.device("cpu")
+        if preferred_device and preferred_device.startswith("xpu"):
+            if preferred_device == "xpu" or (
+                ":" in preferred_device
+                and int(preferred_device.split(":")[1]) >= torch.xpu.device_count()
+            ):
+                preferred_device = None  # Handle as if no preferred device was specified
+            else:
+                device = torch.device(preferred_device)
+                if device.type == "xpu" and device.index < torch.xpu.device_count():
+                    vram_used = torch.xpu.memory_allocated(device) / (1024**2)  # In MB
+                    print(f"Using preferred device: {device}, VRAM used: {vram_used:.2f} MB")
+                    return device
+
+        if torch.xpu.is_available():
+            device_id = random.choice(range(torch.xpu.device_count()))  # Random available XPU
+            device = torch.device(f"xpu:{device_id}")
+            vram_used = torch.xpu.memory_allocated(device) / (1024**2)  # In MB
+            print(f"Selected device: {device}, VRAM used: {vram_used:.2f} MB")
+            return device
+    except Exception as e:
+        print(f"An error occurred while selecting the device: {e}")
+    # Fall through here on error or when no XPU is available, so the function
+    # always returns a device (the original returned None in this case).
+    print("No XPU devices available or preferred device not found. Using CPU.")
+    return torch.device("cpu")
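For context, this is how the helper above might be exercised — a hypothetical usage sketch; the patch as shown never calls select_device and pins the model to CPU instead:

```python
# Hypothetical usage of select_device(); not invoked anywhere in this patch.
device = select_device("xpu:0")  # prefer the first XPU, else fall back to CPU
# model.to(device)               # where the chosen device would be applied
```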
+
+###########################################################################################
+# Cache path for the model
+MODEL_CACHE_PATH = "./"
+
+# Model name to be used
+model_name = "perlthoughts/Falkor-11b"
+
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    low_cpu_mem_usage=True,
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
+    cache_dir="./",
+).to("cpu").eval()  # load_in_8bit=True
+
+# IPEX weight-only quantization and optimization
+qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping(
+    weight_dtype=torch.qint8,  # or torch.quint4x2
+    lowp_mode=ipex.quantization.WoqLowpMode.NONE,  # or FP16, BF16, INT8
+)
+
+checkpoint = None  # optionally load an int4/int8 checkpoint, e.g. "falkor-11b-q8_0.gguf"
+model = ipex.optimize_transformers(model, quantization_config=qconfig, low_precision_checkpoint=checkpoint)
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, cache_dir="./")
+
+
+def generate_text():
+    # Build a Hugging Face text-generation pipeline around the optimized model
+    return pipeline(task="text-generation", model=model, tokenizer=tokenizer)
+
+
+hf_pipeline = HuggingFacePipeline(pipeline=generate_text())
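A quick, illustrative smoke test of the wrapped pipeline above — not part of the patch, and the prompt is made up; LangChain LLM wrappers of this era are directly callable with a prompt string:

```python
# Illustrative smoke test: call the LangChain-wrapped pipeline directly.
sample = hf_pipeline("List one clause commonly found in a master services agreement.")
print(sample)
```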
+
+token = "your hugging face token"  # placeholder; set your Hugging Face token here
+
+# Image for the Streamlit page
+img = Image.open("images/affine.jpg")
+
+# Streamlit page configuration
+page_config = {"page_title": "invoice_tool.io", "page_icon": img, "layout": "wide"}
+st.set_page_config(**page_config)
+
+# Lay out the header as four columns: two narrow columns (spacer and logo),
+# one wide column for the title, and a narrow right-hand spacer.
+
+# CSS that hides Streamlit's default menu and footer
+hide_streamlit_style = """
+    <style>
+    #MainMenu {visibility: hidden;}
+    footer {visibility: hidden;}
+    </style>
+    """
+st.markdown(hide_streamlit_style, unsafe_allow_html=True)
+
+col1, col2, col3, col4 = st.columns([2.5, 2.5, 8.5, 0.5])
+with col1:
+    st.write(' ')
+with col2:
+    # img = Image.open("images/affine.jpg")
+    st.image(img, width=130)
+
+with col3:
+    st.markdown("""
+<div>
+Contract Comparator
+</div>
+""", unsafe_allow_html=True)
+
+    st.write("**Get a comparison of two contracts**")
+
+with col4:
+    st.write(' ')
+
+st.write("\n")
+st.write("\n")
+
+path = "data/Contracts/"
+
+with st.sidebar:
+    st.markdown("""
+<div>
+Contract Comparator
+</div>
+""", unsafe_allow_html=True)
+
+    subdirs = os.listdir(path)
+    subdirs.insert(0, "Select2")
+    subdirs.insert(0, "Select1")
+
+    print(subdirs)
+    doc1 = st.selectbox('**Pick the first document:**', sorted(subdirs), key="0")
+    # doc1, doc2 = "leave1.pdf", "leave2.pdf"
+    doc2 = st.selectbox('**Pick the second document:**', sorted(subdirs), key="1")
+    trigger_1 = False
+    if doc1 != "Select1" and doc2 != "Select2":
+
+        st.write("**Click the button below to compare the two documents:**")
+        trigger_1 = st.button("Compare")
+
+    st.write("\n")
+    st.write("\n")
+
+qa_model = QAModel()
+
+file_1 = f"{path}{doc1}"
+file_2 = f"{path}{doc2}"
+
+# Recurring page-footer text to be masked out of the extracted PDF text
+footer = """Analytics"""
+special_characters = r"[]{}()^$.*+?|\\"
+
+# Escape special characters by adding a backslash before them
+escaped_string = re.sub(f"[{''.join(re.escape(char) for char in special_characters)}]", r"\\\g<0>", footer)
+pattern = re.sub(r'\s+', r'\\s*', escaped_string)
+replacement = " footer "
+
+if trigger_1:
+
+    if os.path.exists('.chroma'):
+        shutil.rmtree('.chroma')
+
+    # Load the first file and store its embeddings
+    pdf_reader1 = PyPDFLoader(file_1)
+    # documents1 = pdf_reader1.load()
+    if file_1 is not None:
+        pdf_reader = PdfReader(file_1)
+        print('PDF LOADED')
+        documents1 = ""
+        for page in pdf_reader.pages:
+            documents1 += ' '.join(page.extract_text().splitlines())
+        # Mask recurring footers, then split into chunks
+        documents1 = re.sub(pattern, replacement, documents1)
+        document_chunks1 = qa_model.document_splitter_assistant(documents1, user_input_chunk_size=300, user_input_chunk_overlap=30)
+        embedd_path = os.path.join("./chroma_db", doc1.split(".")[0])
+        if not os.path.exists(embedd_path):
+            if not os.path.exists("./chroma_db"):
+                os.mkdir("./chroma_db")
+            os.mkdir(embedd_path)
+            vectorstore_doc1 = qa_model.create_embedding_assistant(document_chunks1, embedd_path)
+        else:
+            vectorstore_doc1 = Chroma(persist_directory=embedd_path, embedding_function=embeddings)
+
+    pdf_reader2 = PyPDFLoader(file_2)
+    if file_2 is not None:
+        pdf_reader = PdfReader(file_2)
+        print('PDF LOADED')
+        documents2 = ""
+        for page in pdf_reader.pages:
+            documents2 += ' '.join(page.extract_text().splitlines())
+        # Mask recurring footers, then split into chunks
+        documents2 = re.sub(pattern, replacement, documents2)
+        document_chunks2 = qa_model.document_splitter_assistant(documents2, user_input_chunk_size=400, user_input_chunk_overlap=30)
+
+    # Define the retriever
+    # vectorstore_doc2 = qa_model.create_embedding_assistant(document_chunks2)
+    st.write("Comparison : ")
+    st.write("\n")
+    changes = []
+    changes_1 = []
+    highlight_1 = []
+    ct = 0
+
+    # For each chunk of the newer document, retrieve similar chunks of the
+    # older document and ask the LLM to describe any substantive change.
+    for i in document_chunks2:
+        print("*" * 100, "Chunk Counter : ")
+        text = i
+        print(text)
+        st.write("chunks")
+        st.write(i)
+        query = text
+
+        custom_prompt_template = """
+        You are an experienced legal consultant. Your task is to identify the changes made in the latest version of the document as compared to the
+        previous one, and to mention those changes in the output.
+        The document versions are given below:
+
+        Latest version of the document: {text}
+
+        Previous version of the document: {context}
+
+        Guidelines: If there are no substantial legal changes, respond with "No change." Avoid highlighting minor edits such as prepositions, punctuation
+        marks, or formatting adjustments. Focus on meaningful alterations that impact the legal context.
+        Do not make things up. Keep the response short.
+
+        Change:
+        """
+
+        # Unused sample prompt retained from development
+        custom_prompt_template_2 = """
+        In the context provided below there is information to answer the question asked at the end. Read the context carefully and answer the question:
+
+        Context:
+        ##########
+        {context}
+        ##########
+        Explain the leave policy of Affine?
+        """
+
+        llm = hf_pipeline
+
+        PROMPT = PromptTemplate(
+            template=custom_prompt_template,
+            input_variables=["context"],  # , "question"]
+            partial_variables={"text": query},
+        )
+        chain_type_kwargs = {"prompt": PROMPT}
+        # retriever = chroma_db.as_retriever(type="similarity", search_kwargs={"k": 1})
+        custom_qa = RetrievalQA.from_chain_type(
+            llm=llm,
+            chain_type="stuff",
+            retriever=vectorstore_doc1.as_retriever(type="similarity", search_kwargs={"k": 3}),
+            verbose=True,
+            chain_type_kwargs=chain_type_kwargs,
+            return_source_documents=True,
+        )
+
+        try:
+            response = custom_qa({"query": query})
+            st.write("response")
+            st.write(response)
+            # st.write(response['source_documents'])
+            result = response['result']
+            change = result.split("Change:")[-1]
+            if change not in ["None", " None", "• Change: None", "None.", "Change: None",
+                              """• Change: None
+
+The provided text matches the context in the document without any differences."""]:
+                changes.append(change)
+                j = i.split("footer")
+                if isinstance(j, str):
+                    highlight_1.append(j)
+                else:
+                    highlight_1.extend(j)
+                changes_1.append(change)
+                ct += 1
+                st.write(str(ct) + '. ' + change)
+                st.write('\n')
+        except Exception:
+            continue
+        if ct == 6:  # report at most six changes
+            break
+
+    st.write('\n')
+    st.write('\n')
+    st.write("highlight_1")
+    st.write(highlight_1)
+    path_1 = qa_model.pdf_highlight1(file_2, highlight_1)
+    qa_model.displayPDF(path_1)
+    st.write('\n')
+    st.write('\n')
diff --git a/process-flow-diagram.JPG b/process-flow-diagram.JPG
new file mode 100644
index 00000000..7c3efb51
Binary files /dev/null and b/process-flow-diagram.JPG differ
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..ebf8164b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,20 @@
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch
+torchvision
+torchaudio
+intel-extension-for-pytorch
+accelerate==0.23.0
+validators==0.22.0
+transformers==4.32.1
+sentencepiece
+ipywidgets
+pillow
+pypdf
+streamlit
+pyngrok
+backoff
+streamlit-chat==0.0.2.2
+PyPDF2==3.0.1
+langchain==0.0.168
+chromadb==0.3.21
+tiktoken
+openpyxl
+prettytable==3.8.0
+PyMuPDF
diff --git a/saved_contract/MSA v2_1.pdf b/saved_contract/MSA v2_1.pdf
new file mode 100644
index 00000000..5f546a86
Binary files /dev/null and b/saved_contract/MSA v2_1.pdf differ
diff --git a/src/entity.py b/src/entity.py
new file mode 100644
index 00000000..900aa114
--- /dev/null
+++ b/src/entity.py
@@ -0,0 +1,215 @@
+# Import necessary libraries and modules
+import ast
+import base64
+import json
+import os
+from pathlib import Path
+
+import chromadb
+import fitz
+import streamlit as st
+from prettytable import PrettyTable as pt
+from langchain import HuggingFacePipeline
+from langchain.chains import RetrievalQA
+from langchain.document_loaders import PyPDFLoader
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.prompts import PromptTemplate
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores import Chroma
+
+
+class QAModel:
+    def __init__(self):
+        pass
+    def displayPDF(self, file):
+        """
+        Display a PDF file in the browser.
+
+        Parameters:
+            file (str): The path to the PDF file.
+
+        Returns:
+            None
+        """
+        # Read the file and base64-encode it for embedding
+        with open(file, "rb") as f:
+            base64_pdf = base64.b64encode(f.read()).decode('utf-8')
+
+        # Embed the PDF in HTML
+        # pdf_display = F''
+        pdf_display = F'
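The hunk is cut off mid-assignment above. For reference, a typical Streamlit base64 PDF embed looks like the following — a reconstruction under that assumption, not the repository's exact markup:

```python
# Hypothetical completion of the truncated f-string: embed the encoded PDF
# in an iframe and render it with st.markdown.
pdf_display = (
    f'<iframe src="data:application/pdf;base64,{base64_pdf}" '
    f'width="700" height="900" type="application/pdf"></iframe>'
)
st.markdown(pdf_display, unsafe_allow_html=True)
```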