Skip to content

Commit 363d177

Browse files
committed
+ basic document chat
1 parent d2dbbe8 commit 363d177

File tree

4 files changed

+81
-12
lines changed

4 files changed

+81
-12
lines changed

app.py

Lines changed: 74 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,41 @@
33
import uuid
44
from datetime import datetime
55
from streamlit_pdf_viewer import pdf_viewer
6+
import PyPDF2
7+
import pdfplumber
8+
import io
69

710
st.set_page_config(page_title="Ollama Chatbot", layout="wide")
811
st.title("Ollama Chatbot")
912

13+
# PDF Processing functions
14+
def extract_text_from_pdf(pdf_file) -> str:
    """Extract text from an uploaded PDF file.

    pdfplumber is tried first. PyPDF2 is used as a fallback when pdfplumber
    either raises or yields no text.

    Args:
        pdf_file: Uploaded file object exposing ``getvalue()``, which returns
            the raw PDF bytes.

    Returns:
        The extracted text, with a newline appended after every page that
        produced text. Returns ``""`` when no text could be extracted, so the
        caller's truthiness check reliably detects failure. If reading the
        upload or the PyPDF2 fallback raises, the error is shown with
        ``st.error`` and ``""`` is returned.
    """
    try:
        pdf_bytes = pdf_file.getvalue()
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
        return ""

    # Try with pdfplumber first (usually better for text extraction).
    try:
        with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)
        if text.strip():
            return text
    except Exception:
        # Deliberately not fatal: a pdfplumber failure must still allow the
        # PyPDF2 fallback below to run instead of aborting the extraction.
        pass

    # Fallback to PyPDF2 if pdfplumber failed or found no text.
    try:
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
        # Guard against pages that yield None/empty text, and skip them so a
        # scanned PDF does not produce a newline-only (truthy) result.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
        return ""

    text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)
    # Whitespace-only output means nothing usable was extracted.
    return text if text.strip() else ""
40+
1041
# Function to get available Ollama models
1142
def get_ollama_models():
1243
try:
@@ -51,7 +82,8 @@ def create_new_chat():
5182
"title": "New Document Chat",
5283
"document_uploaded": False,
5384
"document_name": None,
54-
"document_content": None
85+
"document_content": None,
86+
"document_text": ""
5587
}
5688
st.session_state.current_chat_id = chat_id
5789
return chat_id
@@ -225,15 +257,27 @@ def format_chat_time(created_at):
225257
)
226258

227259
if uploaded_file is not None:
228-
# Update chat with document info
229-
st.session_state.chats[st.session_state.current_chat_id]["document_uploaded"] = True
230-
st.session_state.chats[st.session_state.current_chat_id]["document_name"] = uploaded_file.name
231-
st.session_state.chats[st.session_state.current_chat_id]["document_content"] = uploaded_file.getvalue()
232-
st.session_state.chats[st.session_state.current_chat_id]["title"] = f"📄 {uploaded_file.name}"
260+
# Extract text from the PDF
261+
with st.spinner("Processing PDF and extracting text..."):
262+
extracted_text = extract_text_from_pdf(uploaded_file)
233263

234-
st.success(f"✅ Document '{uploaded_file.name}' uploaded successfully!")
235-
st.info("💬 You can now start asking questions about your document below.")
236-
st.rerun()
264+
if extracted_text:
265+
# Update chat with document info
266+
st.session_state.chats[st.session_state.current_chat_id]["document_uploaded"] = True
267+
st.session_state.chats[st.session_state.current_chat_id]["document_name"] = uploaded_file.name
268+
st.session_state.chats[st.session_state.current_chat_id]["document_content"] = uploaded_file.getvalue()
269+
st.session_state.chats[st.session_state.current_chat_id]["document_text"] = extracted_text
270+
st.session_state.chats[st.session_state.current_chat_id]["title"] = f"📄 {uploaded_file.name}"
271+
272+
st.success(f"✅ Document '{uploaded_file.name}' uploaded and processed successfully!")
273+
st.info("💬 You can now start asking questions about your document below.")
274+
275+
# Show extracted text info
276+
word_count = len(extracted_text.split())
277+
st.info(f"📊 Extracted {word_count:,} words from the document")
278+
st.rerun()
279+
else:
280+
st.error("❌ Could not extract text from the PDF. Please ensure it's a text-based PDF document.")
237281

238282
else:
239283
# Show current document info if uploaded
@@ -279,8 +323,29 @@ def format_chat_time(created_at):
279323
try:
280324
# Display assistant response with streaming
281325
with st.chat_message("assistant"):
326+
# Get document context
327+
current_chat = st.session_state.chats.get(st.session_state.current_chat_id, {})
328+
document_text = current_chat.get('document_text', '')
329+
282330
# Prepare messages for Ollama API (list of dicts)
283331
current_conversation = []
332+
333+
# Add document context as system message if available
334+
if document_text:
335+
system_prompt = f"""You are a helpful assistant that answers questions about documents. You have been provided with the following document content:
336+
337+
--- DOCUMENT CONTENT ---
338+
{document_text}
339+
--- END DOCUMENT CONTENT ---
340+
341+
Please answer questions based on this document content. If a question cannot be answered from the document, please say so clearly."""
342+
343+
current_conversation.append({
344+
'role': 'system',
345+
'content': system_prompt
346+
})
347+
348+
# Add conversation history
284349
for msg in get_current_messages():
285350
current_conversation.append({'role': msg['role'], 'content': msg['content']})
286351

environment.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@ dependencies:
1010
# You can add other conda packages here
1111
- pip:
1212
- ollama
13-
- streamlit-pdf-viewer
13+
- streamlit-pdf-viewer
14+
- PyPDF2
15+
- pdfplumber
1416
- -e . # Installs the ragnarok package in editable mode
1517
# If you have packages not on conda, list them under pip:
1618
# - package_from_pip

ragnarok/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1+

requirements.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
11
streamlit
22
ollama
3-
streamlit-pdf-viewer
3+
streamlit-pdf-viewer
4+
PyPDF2
5+
pdfplumber

0 commit comments

Comments
 (0)