diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..286ecaf --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +*.log +__pycache__ +*.graphml +*.json +.env + +graph +kv_store +vdb \ No newline at end of file diff --git a/PathRAG/llm.py b/PathRAG/llm.py index 80b5580..601051b 100644 --- a/PathRAG/llm.py +++ b/PathRAG/llm.py @@ -35,8 +35,12 @@ logger, ) +from dotenv import load_dotenv + import sys +load_dotenv() + if sys.version_info < (3, 9): from typing import AsyncIterator else: @@ -55,8 +59,8 @@ async def openai_complete_if_cache( prompt, system_prompt=None, history_messages=[], - base_url="https://api.openai.com/v1", - api_key="", + base_url=os.getenv("BASE_URL"), + api_key=os.getenv("API_KEY"), **kwargs, ) -> str: if api_key: @@ -764,8 +768,8 @@ async def zhipu_embedding( async def openai_embedding( texts: list[str], model: str = "text-embedding-3-small", - base_url="https://api.openai.com/v1", - api_key="", + base_url=os.getenv("BASE_URL"), + api_key=os.getenv("API_KEY"), ) -> np.ndarray: if api_key: os.environ["OPENAI_API_KEY"] = api_key diff --git a/PathRAG/settings.py b/PathRAG/settings.py new file mode 100644 index 0000000..951290d --- /dev/null +++ b/PathRAG/settings.py @@ -0,0 +1,9 @@ +from pathlib import Path +from dotenv import load_dotenv +import os + +BASE_DIR = Path(__file__).resolve().parent.parent + +GRAPH_FILE_PATH = os.path.join(BASE_DIR,'graph') +KV_STORE_FILE_PATH = os.path.join(BASE_DIR,'kv_store') +VDB_FILE__PATH = os.path.join(BASE_DIR,'vdb') \ No newline at end of file diff --git a/PathRAG/storage.py b/PathRAG/storage.py index 1e02042..b7ba7e6 100644 --- a/PathRAG/storage.py +++ b/PathRAG/storage.py @@ -21,12 +21,15 @@ BaseVectorStorage, ) +from .settings import KV_STORE_FILE_PATH +from .settings import GRAPH_FILE_PATH +from .settings import VDB_FILE__PATH + @dataclass class JsonKVStorage(BaseKVStorage): def __post_init__(self): - working_dir = self.global_config["working_dir"] - self._file_name = os.path.join(working_dir, 
f"kv_store_{self.namespace}.json") + self._file_name = os.path.join(KV_STORE_FILE_PATH, f"kv_store_{self.namespace}.json") self._data = load_json(self._file_name) or {} logger.info(f"Load KV {self.namespace} with {len(self._data)} data") @@ -68,9 +71,7 @@ class NanoVectorDBStorage(BaseVectorStorage): cosine_better_than_threshold: float = 0.2 def __post_init__(self): - self._client_file_name = os.path.join( - self.global_config["working_dir"], f"vdb_{self.namespace}.json" - ) + self._client_file_name = os.path.join(VDB_FILE__PATH, f"vdb_{self.namespace}.json") self._max_batch_size = self.global_config["embedding_batch_num"] self._client = NanoVectorDB( self.embedding_func.embedding_dim, storage_file=self._client_file_name @@ -242,9 +243,7 @@ def _get_edge_key(source: Any, target: Any) -> str: return fixed_graph def __post_init__(self): - self._graphml_xml_file = os.path.join( - self.global_config["working_dir"], f"graph_{self.namespace}.graphml" - ) + self._graphml_xml_file = os.path.join(GRAPH_FILE_PATH, f"graph_{self.namespace}.graphml") preloaded_graph = NetworkXStorage.load_nx_graph(self._graphml_xml_file) if preloaded_graph is not None: logger.info( diff --git a/README.md b/README.md index e7ad841..ba675c8 100644 --- a/README.md +++ b/README.md @@ -2,41 +2,38 @@ The code for the paper **"PathRAG: Pruning Graph-based Retrieval Augmented Gener ## Install ```bash cd PathRAG -pip install -e . +pip install -e . # or pip install -r requirements.txt ``` -## Quick Start -* You can quickly experience this project in the `v1_test.py` file. -* Set OpenAI API key in environment if using OpenAI models: `api_key="sk-...".` in the `v1_test.py` and `llm.py` file -* Prepare your retrieval document "text.txt". -* Use the following Python snippet in the "v1_text.py" file to initialize PathRAG and perform queries. 
- -```python -import os -from PathRAG import PathRAG, QueryParam -from PathRAG.llm import gpt_4o_mini_complete -WORKING_DIR = "./your_working_dir" -api_key="your_api_key" -os.environ["OPENAI_API_KEY"] = api_key -base_url="https://api.openai.com/v1" -os.environ["OPENAI_API_BASE"]=base_url +## RUN the project +### Windows -if not os.path.exists(WORKING_DIR): - os.mkdir(WORKING_DIR) +```bash +python -m venv .venv # create virtual environment +.venv\Scripts\activate # activate the virtual environment +python v1_test.py # to run the project -rag = PathRAG( - working_dir=WORKING_DIR, - llm_model_func=gpt_4o_mini_complete, -) +# if it doesn't work properly then try reinstalling the packages using the above installation command +``` -data_file="./text.txt" -question="your_question" -with open(data_file) as f: - rag.insert(f.read()) +### Linux/Unix -print(rag.query(question, param=QueryParam(mode="hybrid"))) +```bash +python3 -m venv .venv # create virtual environment +source .venv/bin/activate # activate the virtual environment +python3 v1_test.py # to run the project + +# if it doesn't work properly then try reinstalling the packages using the above installation command ``` + +## Quick Start +* You can quickly experience this project in the `v1_test.py` file. +* Rename `example.env` to `.env` +* Set OpenAI API key in `.env` file and the BASE URL. +* Prepare your retrieval document `text.txt`. You can modify this in the code in `v1_test.py`. +* The `v1_test.py` file is the entry point to initialize PathRAG and perform queries. + ## Parameter modification You can adjust the relevant parameters in the `base.py` and `operate.py` files. 
diff --git a/example.env b/example.env new file mode 100644 index 0000000..ee815bc --- /dev/null +++ b/example.env @@ -0,0 +1,2 @@ +API_KEY= +BASE_URL=https://api.openai.com/v1 \ No newline at end of file diff --git a/v1_test.py b/v1_test.py index 7489ce3..9251f09 100644 --- a/v1_test.py +++ b/v1_test.py @@ -1,13 +1,37 @@ import os from PathRAG import PathRAG, QueryParam from PathRAG.llm import gpt_4o_mini_complete +from pathlib import Path -WORKING_DIR = "" +from dotenv import load_dotenv -api_key="" +load_dotenv() + +WORKING_DIR = "./PathRAG" + +# Define storage paths +BASE_DIR = Path(__file__).resolve().parent +GRAPH_FILE_PATH = os.path.join(BASE_DIR, 'graph') +KV_STORE_FILE_PATH = os.path.join(BASE_DIR, 'kv_store') +VDB_FILE_PATH = os.path.join(BASE_DIR, 'vdb') + +# Ensure directories exist +for path in [GRAPH_FILE_PATH, KV_STORE_FILE_PATH, VDB_FILE_PATH]: + os.makedirs(path, exist_ok=True) + +# Ensure necessary JSON files exist +for file_name in ["kv_store_full_docs.json", "kv_store_text_chunks.json", "kv_store_llm_response_cache.json"]: + file_path = os.path.join(KV_STORE_FILE_PATH, file_name) + if not os.path.exists(file_path): + with open(file_path, "w", encoding="utf-8") as f: + f.write("{}") # Initialize with empty JSON object + +# Set up API keys +api_key = os.getenv("API_KEY") os.environ["OPENAI_API_KEY"] = api_key -base_url="https://api.openai.com/v1" -os.environ["OPENAI_API_BASE"]=base_url +base_url = os.getenv("BASE_URL") +os.environ["OPENAI_API_BASE"] = base_url + if not os.path.exists(WORKING_DIR): @@ -18,10 +42,16 @@ llm_model_func=gpt_4o_mini_complete, ) -data_file="" -question="" -with open(data_file) as f: - rag.insert(f.read()) +data_file="text.txt" +question="what is this document all about?" + +with open(data_file, "r", encoding="utf-8") as f: + file_content = f.read().strip() + +if not file_content: + raise ValueError("The input file is empty. 
Please provide valid content.") + +rag.insert(file_content) print(rag.query(question, param=QueryParam(mode="hybrid")))