|
3 | 3 | import uuid
|
4 | 4 | from datetime import datetime
|
5 | 5 | from streamlit_pdf_viewer import pdf_viewer
|
| 6 | +import PyPDF2 |
| 7 | +import pdfplumber |
| 8 | +import io |
6 | 9 |
|
7 | 10 | st.set_page_config(page_title="Ollama Chatbot", layout="wide")
|
8 | 11 | st.title("Ollama Chatbot")
|
9 | 12 |
|
| 13 | +# PDF Processing functions |
def extract_text_from_pdf(pdf_file) -> str:
    """Extract the text of an uploaded PDF, trying pdfplumber and then PyPDF2.

    Args:
        pdf_file: Object exposing ``getvalue()`` that returns the raw PDF
            bytes (presumably a Streamlit ``UploadedFile`` -- confirm
            against the caller).

    Returns:
        The concatenated page text, one trailing newline per page. An empty
        string is returned when no non-whitespace text could be extracted,
        so callers can rely on plain truthiness to detect failure. Any
        extraction error is reported through ``st.error`` in that case.
    """
    try:
        pdf_bytes = pdf_file.getvalue()
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
        return ""

    # Remember the most recent extractor failure so it can be surfaced if
    # neither library ends up producing usable text.
    last_error = None

    # Try with pdfplumber first (usually better for text extraction).
    # Broad catch is deliberate: a parser failure here must not prevent
    # the PyPDF2 fallback below from being attempted.
    try:
        with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        text = "".join(
            page_text + "\n" for page_text in page_texts if page_text
        )
        if text.strip():
            return text
    except Exception as e:
        last_error = e

    # Fallback to PyPDF2 if pdfplumber failed or found no text.
    try:
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
        # ``or ""`` guards against a page yielding None instead of a string.
        text = "".join(
            (page.extract_text() or "") + "\n" for page in pdf_reader.pages
        )
        if text.strip():
            return text
    except Exception as e:
        last_error = e

    # Whitespace-only output (e.g. a scanned, image-only PDF) counts as a
    # failed extraction: return "" rather than a truthy run of newlines.
    if last_error is not None:
        st.error(f"Error extracting text from PDF: {last_error}")
    return ""
| 40 | + |
10 | 41 | # Function to get available Ollama models
|
11 | 42 | def get_ollama_models():
|
12 | 43 | try:
|
@@ -51,7 +82,8 @@ def create_new_chat():
|
51 | 82 | "title": "New Document Chat",
|
52 | 83 | "document_uploaded": False,
|
53 | 84 | "document_name": None,
|
54 |
| - "document_content": None |
| 85 | + "document_content": None, |
| 86 | + "document_text": "" |
55 | 87 | }
|
56 | 88 | st.session_state.current_chat_id = chat_id
|
57 | 89 | return chat_id
|
@@ -225,15 +257,27 @@ def format_chat_time(created_at):
|
225 | 257 | )
|
226 | 258 |
|
227 | 259 | if uploaded_file is not None:
|
228 |
| - # Update chat with document info |
229 |
| - st.session_state.chats[st.session_state.current_chat_id]["document_uploaded"] = True |
230 |
| - st.session_state.chats[st.session_state.current_chat_id]["document_name"] = uploaded_file.name |
231 |
| - st.session_state.chats[st.session_state.current_chat_id]["document_content"] = uploaded_file.getvalue() |
232 |
| - st.session_state.chats[st.session_state.current_chat_id]["title"] = f"📄 {uploaded_file.name}" |
| 260 | + # Extract text from the PDF |
| 261 | + with st.spinner("Processing PDF and extracting text..."): |
| 262 | + extracted_text = extract_text_from_pdf(uploaded_file) |
233 | 263 |
|
234 |
| - st.success(f"✅ Document '{uploaded_file.name}' uploaded successfully!") |
235 |
| - st.info("💬 You can now start asking questions about your document below.") |
236 |
| - st.rerun() |
| 264 | + if extracted_text: |
| 265 | + # Update chat with document info |
| 266 | + st.session_state.chats[st.session_state.current_chat_id]["document_uploaded"] = True |
| 267 | + st.session_state.chats[st.session_state.current_chat_id]["document_name"] = uploaded_file.name |
| 268 | + st.session_state.chats[st.session_state.current_chat_id]["document_content"] = uploaded_file.getvalue() |
| 269 | + st.session_state.chats[st.session_state.current_chat_id]["document_text"] = extracted_text |
| 270 | + st.session_state.chats[st.session_state.current_chat_id]["title"] = f"📄 {uploaded_file.name}" |
| 271 | + |
| 272 | + st.success(f"✅ Document '{uploaded_file.name}' uploaded and processed successfully!") |
| 273 | + st.info("💬 You can now start asking questions about your document below.") |
| 274 | + |
| 275 | + # Show extracted text info |
| 276 | + word_count = len(extracted_text.split()) |
| 277 | + st.info(f"📊 Extracted {word_count:,} words from the document") |
| 278 | + st.rerun() |
| 279 | + else: |
| 280 | + st.error("❌ Could not extract text from the PDF. Please ensure it's a text-based PDF document.") |
237 | 281 |
|
238 | 282 | else:
|
239 | 283 | # Show current document info if uploaded
|
@@ -279,8 +323,29 @@ def format_chat_time(created_at):
|
279 | 323 | try:
|
280 | 324 | # Display assistant response with streaming
|
281 | 325 | with st.chat_message("assistant"):
|
| 326 | + # Get document context |
| 327 | + current_chat = st.session_state.chats.get(st.session_state.current_chat_id, {}) |
| 328 | + document_text = current_chat.get('document_text', '') |
| 329 | + |
282 | 330 | # Prepare messages for Ollama API (list of dicts)
|
283 | 331 | current_conversation = []
|
| 332 | + |
| 333 | + # Add document context as system message if available |
| 334 | + if document_text: |
| 335 | + system_prompt = f"""You are a helpful assistant that answers questions about documents. You have been provided with the following document content: |
| 336 | +
|
| 337 | +--- DOCUMENT CONTENT --- |
| 338 | +{document_text} |
| 339 | +--- END DOCUMENT CONTENT --- |
| 340 | +
|
| 341 | +Please answer questions based on this document content. If a question cannot be answered from the document, please say so clearly.""" |
| 342 | + |
| 343 | + current_conversation.append({ |
| 344 | + 'role': 'system', |
| 345 | + 'content': system_prompt |
| 346 | + }) |
| 347 | + |
| 348 | + # Add conversation history |
284 | 349 | for msg in get_current_messages():
|
285 | 350 | current_conversation.append({'role': msg['role'], 'content': msg['content']})
|
286 | 351 |
|
|
0 commit comments