Skip to content

Commit 9d88b06

Browse files
authored
Merge pull request #872 from oracle-devrel/anshSummarizeUpdate
created folder structure and some files for document summarization
2 parents 6f333c7 + 1c6fd78 commit 9d88b06

File tree

2 files changed

+192
-0
lines changed

2 files changed

+192
-0
lines changed
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
# Document Summarization Using Oracle Generative AI

Text summarization, a core NLP task, unlocks the ability to distill lengthy content into concise, informative summaries. Large Language Models (LLMs) serve as powerful tools for summarizing a wide array of texts, including news articles, research papers, and technical documents. However, summarizing large documents comes with its own set of challenges, necessitating the application of specialized summarization strategies to indexed content.

In this article, we'll delve into the creation of a powerful document summarization solution leveraging Oracle Generative AI. Through the integration of Oracle Gen AI's advanced capabilities with cutting-edge technologies such as langchain, this codebase empowers users to effortlessly summarize extensive documents, harnessing the power of Oracle Generative AI Service.

# When to use this asset?

See the README document in the /files folder.

# How to use this asset?

See the README document in the /files folder.

# License

Copyright (c) 2024 Oracle and/or its affiliates.

Licensed under the Universal Permissive License (UPL), Version 1.0.

See [LICENSE](https://github.com/oracle-devrel/technology-engineering/blob/main/LICENSE) for more details.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
import streamlit as st
2+
import os
3+
from langchain.document_loaders import PyPDFLoader
4+
from langchain.prompts import PromptTemplate
5+
from langchain.text_splitter import RecursiveCharacterTextSplitter
6+
from langchain.chains.summarize import load_summarize_chain
7+
from langchain.chains import LLMChain
8+
from langchain_community.llms import OCIGenAI
9+
# from genai_langchain_integration.langchain_oci import OCIGenAI
10+
from pypdf import PdfReader
11+
from io import BytesIO
12+
from typing import Any, Dict, List
13+
import re
14+
from langchain.docstore.document import Document
15+
16+
17+
18+
@st.cache_data
def parse_pdf(file: BytesIO) -> List[str]:
    """Extract and normalize the text of every page in an uploaded PDF.

    Args:
        file: In-memory PDF file (e.g. a Streamlit ``UploadedFile``).

    Returns:
        A list with one cleaned-up text string per page.
    """
    pdf = PdfReader(file)
    output = []
    for page in pdf.pages:
        # Fix: extract_text() may return None for image-only pages,
        # which would crash the regex calls below — treat as empty text.
        text = page.extract_text() or ""
        # Merge words hyphenated across a line break.
        text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
        # Join newlines that fall in the middle of sentences.
        text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
        # Collapse runs of blank lines into a single paragraph break.
        text = re.sub(r"\n\s*\n", "\n\n", text)
        output.append(text)
    return output
32+
33+
@st.cache_data
def text_to_docs(text: str, chunk_size, chunk_overlap) -> List[Document]:
    """Convert a string (or list of page strings) into chunked Documents.

    Each chunk carries ``page``, ``chunk`` and ``source`` metadata so that a
    summary can be traced back to its origin in the PDF.

    Args:
        text: A single string (treated as one page) or a list of page strings.
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap: Number of overlapping characters between chunks.

    Returns:
        A list of langchain ``Document`` objects, one per chunk.
    """
    if isinstance(text, str):
        # Take a single string as one page.
        text = [text]
    page_docs = [Document(page_content=page) for page in text]

    # Add 1-based page numbers as metadata.
    for i, doc in enumerate(page_docs):
        doc.metadata["page"] = i + 1

    # The splitter configuration is loop-invariant — build it once, not per page.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        chunk_overlap=chunk_overlap,
    )

    # Split pages into chunks. A distinct name avoids the original's
    # shadowing of the page document by the chunk document.
    doc_chunks = []
    for page_doc in page_docs:
        chunks = text_splitter.split_text(page_doc.page_content)
        for i, chunk in enumerate(chunks):
            chunk_doc = Document(
                page_content=chunk,
                metadata={"page": page_doc.metadata["page"], "chunk": i},
            )
            # Add a "page-chunk" source tag for traceability.
            chunk_doc.metadata["source"] = (
                f"{chunk_doc.metadata['page']}-{chunk_doc.metadata['chunk']}"
            )
            doc_chunks.append(chunk_doc)
    return doc_chunks
67+
68+
69+
def custom_summary(docs, llm, custom_prompt, chain_type, num_summaries):
    """Summarize ``docs`` with ``llm`` using the requested langchain chain.

    Args:
        docs: List of langchain ``Document`` chunks to summarize.
        llm: Language model driving the summarize chain.
        custom_prompt: User-supplied instruction; the ``{text}`` placeholder
            is appended here before building the combine prompt.
        chain_type: One of ``"map_reduce"``, ``"stuff"`` or ``"refine"``.
        num_summaries: How many summaries to generate.

    Returns:
        A list of ``num_summaries`` summary strings.
    """
    # Build the combine template without mutating the caller's argument,
    # and without the original's leftover debug prints.
    combine_template = custom_prompt + """:\n {text}"""
    combine_prompt = PromptTemplate(template=combine_template, input_variables=["text"])
    map_prompt = PromptTemplate(template="Summarize:\n{text}", input_variables=["text"])

    # Only map_reduce accepts separate map/combine prompts.
    if chain_type == "map_reduce":
        chain = load_summarize_chain(
            llm,
            chain_type=chain_type,
            map_prompt=map_prompt,
            combine_prompt=combine_prompt,
        )
    else:
        chain = load_summarize_chain(llm, chain_type=chain_type)

    summaries = []
    for _ in range(num_summaries):
        summary_output = chain({"input_documents": docs}, return_only_outputs=True)["output_text"]
        summaries.append(summary_output)

    return summaries
96+
97+
98+
def main():
    """Streamlit entry point: upload a PDF, chunk it, and summarize it with OCI GenAI."""
    st.set_page_config(layout="wide")
    hide_streamlit_style = """
    <style>
    [data-testid="stToolbar"] {visibility: hidden !important;}
    footer {visibility: hidden !important;}
    </style>
    """
    st.markdown(hide_streamlit_style, unsafe_allow_html=True)
    st.title("Document Summarization App")

    # Sidebar controls for model and chunking parameters.
    llm_choice = st.sidebar.selectbox("LLM", ["OracleGenAI", "Other (other source in the future)"])
    chain_type = st.sidebar.selectbox("Chain Type", ["map_reduce", "stuff", "refine"])
    chunk_size = st.sidebar.slider("Chunk Size", min_value=20, max_value=10000,
                                   step=10, value=4000)
    chunk_overlap = st.sidebar.slider("Chunk Overlap", min_value=5, max_value=5000,
                                      step=10, value=200)
    user_prompt = st.text_input("Enter the document summary prompt",
                                value="Compose a concise and brief summary of this text. ")
    temperature = st.sidebar.number_input("Set the GenAI Temperature",
                                          min_value=0.0,
                                          max_value=1.0,
                                          step=0.1,
                                          value=0.5)

    pages = None
    uploaded_file = st.file_uploader(
        "**Upload a Pdf file :**",
        type=["pdf"],
    )
    if uploaded_file:
        # The uploader only accepts PDFs, so the original ".txt" branch
        # (which called the undefined parse_txt()) could never run and
        # has been removed.
        doc = parse_pdf(uploaded_file)
        pages = text_to_docs(doc, chunk_size, chunk_overlap)

    page_holder = st.empty()
    if pages:
        st.write("PDF loaded successfully")
        with page_holder.expander("File Content", expanded=False):
            st.write(pages)

        # NOTE(review): the endpoint and compartment OCID are hard-coded —
        # move them to configuration/environment before publishing.
        llm = OCIGenAI(
            model_id="cohere.command",
            service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
            compartment_id="ocid1.compartment.oc1..aaaaaaaa7ggqkd4ptkeb7ugk6ipsl3gqjofhkr6yacluwj4fitf2ufrdm65q",
            # Fix: the temperature slider value was collected but never used;
            # pass it through as the commented-out original implementation did.
            temperature=temperature,
        )

        if st.button("Summarize"):
            with st.spinner('Summarizing....'):
                result = custom_summary(pages, llm, user_prompt, chain_type, 1)
            st.write("Summary:")
            for summary in result:
                st.write(summary)
    else:
        st.warning("No file found. Upload a file to summarize!")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)