diff --git a/README.md b/README.md index e7d86d0..fd659e4 100644 --- a/README.md +++ b/README.md @@ -2,19 +2,77 @@ A Python package for building RAG (Retrieval-Augmented Generation) applications using PDFs, ChromaDB, and Ollama. -## Installation +## Project Structure + +``` +. +├── pdf_rag +│ ├── document_processor.py +│ ├── __init__.py +│ ├── llm_interface.py +│ ├── main.py +│ └── vector_store.py +├── README.md +├── requirements.txt +├── setup.py +├── test_package.py +└── test.py -```bash -pip install -e . +2 directories, 10 files ``` -## Usage +## Installation + +1. **Create and activate a virtual environment:** + + ```bash + # Create a virtual environment + python -m venv venv + + # Activate the virtual environment + # On Windows + venv\Scripts\activate + # On Unix or MacOS + source venv/bin/activate + ``` + +2. **Install the package:** + + ```bash + pip install -e . + ``` + +3. **Install Ollama on Linux:** + + Follow the steps below to install Ollama on a Linux system. + + ```bash + # Download the Ollama installer + curl -fsSL https://ollama.com/install.sh | sh + + # Verify the installation + ollama --version + ``` + +4. 
**Download models in Ollama:** + + To download specific models such as `llama3` and `deepseek-R1`, use the following commands: + + ```bash + # Download the llama3 model + ollama pull llama3 + + # Download the deepseek-R1 model + ollama pull deepseek-R1 + ``` + +## Basic Usage ```python from pdf_rag import PDFRAGApplication # Initialize the application -rag = PDFRAGApplication() +rag = PDFRAGApplication(model_name= "deepseek-r1") # Load a PDF rag.load_pdf("your_document.pdf") @@ -23,3 +81,31 @@ rag.load_pdf("your_document.pdf") response = rag.query("What is this document about?") print(response) ``` + +## Testing + +Run the `test.py` script to see how the module works with ChromaDB: + +```python +import chromadb +chroma_client = chromadb.Client() + +# switch `create_collection` to `get_or_create_collection` to avoid creating a new collection every time +collection = chroma_client.get_or_create_collection(name="my_collection") + +# switch `add` to `upsert` to avoid adding the same documents every time +collection.upsert( + documents=[ + "This is a document about pineapple", + "This is a document about oranges" + ], + ids=["id1", "id2"] +) + +results = collection.query( + query_texts=["This is a query document about hawaii"], # Chroma will embed this for you + n_results=2 # how many results to return +) + +print(results) +``` diff --git a/pdf_rag/__pycache__/__init__.cpython-310.pyc b/pdf_rag/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 9256df7..0000000 Binary files a/pdf_rag/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/pdf_rag/__pycache__/document_processor.cpython-310.pyc b/pdf_rag/__pycache__/document_processor.cpython-310.pyc deleted file mode 100644 index 3d2d0a1..0000000 Binary files a/pdf_rag/__pycache__/document_processor.cpython-310.pyc and /dev/null differ diff --git a/pdf_rag/__pycache__/llm_interface.cpython-310.pyc b/pdf_rag/__pycache__/llm_interface.cpython-310.pyc deleted file mode 100644 index 
42392db..0000000 Binary files a/pdf_rag/__pycache__/llm_interface.cpython-310.pyc and /dev/null differ diff --git a/pdf_rag/__pycache__/main.cpython-310.pyc b/pdf_rag/__pycache__/main.cpython-310.pyc deleted file mode 100644 index c5d6ceb..0000000 Binary files a/pdf_rag/__pycache__/main.cpython-310.pyc and /dev/null differ diff --git a/pdf_rag/__pycache__/vector_store.cpython-310.pyc b/pdf_rag/__pycache__/vector_store.cpython-310.pyc deleted file mode 100644 index db37c90..0000000 Binary files a/pdf_rag/__pycache__/vector_store.cpython-310.pyc and /dev/null differ diff --git a/setup.py b/setup.py index a16de45..c3a1265 100644 --- a/setup.py +++ b/setup.py @@ -9,8 +9,6 @@ "chromadb", "requests" ], - author="Your Name", - author_email="your.email@example.com", description="A RAG application for PDF documents using ChromaDB and Ollama", long_description=open("README.md").read(), long_description_content_type="text/markdown", diff --git a/test_package.py b/test_package.py index 47f0811..0d99f9e 100644 --- a/test_package.py +++ b/test_package.py @@ -1,12 +1,15 @@ -## Usage from pdf_rag import PDFRAGApplication -# Initialize the application -rag = PDFRAGApplication( model_name="llama3") +def test_pdf_rag(): + # Initialize the application + rag = PDFRAGApplication(model_name= "deepseek-r1") -# Load a PDF -rag.load_pdf("resume.pdf") + # Load a PDF + rag.load_pdf("your_document.pdf") -# Query the system -response = rag.query("What is this document about?") -print(response) + # Query the system + response = rag.query("What is this document about?") + print(response) + +if __name__ == "__main__": + test_pdf_rag() \ No newline at end of file