|
| 1 | += Neo4j GraphRAG Python Package |
| 2 | +include::_graphacademy_llm.adoc[] |
| 3 | +:slug: graphrag-python |
| 4 | +:author: |
| 5 | +:category: genai-ecosystem |
| 6 | +:tags: graphrag, knowledgegraph, embedding, vectorsearch, neo4j, python |
| 7 | +:neo4j-versions: 5.23+ |
| 8 | +:page-pagination: |
| 9 | +:page-product: neo4j |
| 10 | + |
| 11 | + |
| 12 | +The Neo4j GraphRAG package is a comprehensive Python library that allows building GenAI applications. |
| 13 | +It supports knowledge graph creation through a pipeline that extracts entities from unstructured text, generates embeddings, and creates a graph in Neo4j. |
| 14 | +The package also provides a number of retrievers, for graph search, vector search and integration with vector databases. |
| 15 | + |
| 16 | +== Functionality Includes |
| 17 | + |
| 18 | +* Knowledge Graph Construction Pipeline |
| 19 | +* Neo4j Vector Retriever |
| 20 | +* Vector Cypher Retriever |
| 21 | +* Vector Database Retriever |
| 22 | + |
| 23 | +image::https://cdn.graphacademy.neo4j.com/assets/img/courses/banners/genai-workshop-graphrag.png[width=800,link="https://graphacademy.neo4j.com/courses/genai-workshop-graphrag/"] |
| 24 | + |
| 25 | +== Usage - Examples for a BioMedical Knowledge Graph |
| 26 | + |
| 27 | +First, Knowledge Graph Construction using the SimpleKGPipeline |
| 28 | + |
| 29 | +image::https://dist.neo4j.com/wp-content/uploads/20241015075828/simplekgpipeline-1.png[] |
| 30 | + |
| 31 | +Setup of the Neo4j connection, schema, foundation models (LLM, Embeddings) and the extraction prompt template. |
| 32 | + |
| 33 | +[source,python] |
| 34 | +---- |
| 35 | +# Neo4j Driver |
| 36 | +import neo4j |
| 37 | +
|
| 38 | +driver = neo4j.GraphDatabase.driver(NEO4J_URI, |
| 39 | + auth=(NEO4J_USERNAME, NEO4J_PASSWORD)) |
| 40 | +
|
| 41 | +# LLM and Embedding Model |
| 42 | +from neo4j_graphrag.llm import OpenAILLM |
| 43 | +from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings |
| 44 | +
|
| 45 | +llm=OpenAILLM( |
| 46 | + model_name="gpt-4o-mini", |
| 47 | + model_params={ |
| 48 | + "response_format": {"type": "json_object"}, # use json_object formatting for best results |
| 49 | + "temperature": 0 # turning temperature down for more deterministic results |
| 50 | + } |
| 51 | +) |
| 52 | +
|
| 53 | +# Graph Schema Setup |
| 54 | +basic_node_labels = ["Object", "Entity", "Group", "Person", "Organization", "Place"] |
| 55 | +
|
| 56 | +academic_node_labels = ["ArticleOrPaper", "PublicationOrJournal"] |
| 57 | +
|
| 58 | +medical_node_labels = ["Anatomy", "BiologicalProcess", "Cell", "CellularComponent", |
| 59 | + "CellType", "Condition", "Disease", "Drug", |
| 60 | + "EffectOrPhenotype", "Exposure", "GeneOrProtein", "Molecule", |
| 61 | + "MolecularFunction", "Pathway"] |
| 62 | +
|
| 63 | +node_labels = basic_node_labels + academic_node_labels + medical_node_labels |
| 64 | +
|
| 65 | +# define relationship types |
| 66 | +rel_types = ["ACTIVATES", "AFFECTS", "ASSESSES", "ASSOCIATED_WITH", "AUTHORED", |
| 67 | + "BIOMARKER_FOR", …] |
| 68 | +
|
| 69 | +#create text embedder |
| 70 | +embedder = OpenAIEmbeddings() |
| 71 | +
|
| 72 | +# define prompt template |
| 73 | +prompt_template = ''' |
| 74 | +You are a medical researcher tasked with extracting information from papers |
| 75 | +and structuring it in a property graph to inform further medical and research Q&A. |
| 76 | +
|
| 77 | +Extract the entities (nodes) and specify their type from the following Input text. |
| 78 | +Also extract the relationships between these nodes. The relationship direction goes from the start node to the end node. |
| 79 | +
|
| 80 | +
|
| 81 | +Return result as JSON using the following format: |
| 82 | +{{"nodes": [ {{"id": "0", "label": "the type of entity", "properties": {{"name": "name of entity" }} }}], |
| 83 | + "relationships": [{{"type": "TYPE_OF_RELATIONSHIP", "start_node_id": "0", "end_node_id": "1", "properties": {{"details": "Description of the relationship"}} }}] }} |
| 84 | +
|
| 85 | +... |
| 86 | +
|
| 87 | +Use only the following nodes and relationships: |
| 88 | +{schema} |
| 89 | +
|
| 90 | +Assign a unique ID (string) to each node, and reuse it to define relationships. |
| 91 | +Do respect the source and target node types for relationship and the relationship direction. |
| 92 | +
|
| 93 | +Do not return any additional information other than the JSON in it. |
| 94 | +
|
| 95 | +Examples: |
| 96 | +{examples} |
| 97 | +
|
| 98 | +Input text: |
| 99 | +
|
| 100 | +{text} |
| 101 | +''' |
| 102 | +---- |
| 103 | + |
| 104 | +Knowledge Graph Pipeline Setup and Execution with example PDFs |
| 105 | + |
| 106 | +[source,python] |
| 107 | +---- |
| 108 | +# Knowledge Graph Builder |
| 109 | +from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import FixedSizeSplitter |
| 110 | +from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline |
| 111 | +
|
| 112 | +kg_builder_pdf = SimpleKGPipeline( |
| 113 | + llm=llm, |
| 114 | + driver=driver, |
| 115 | + text_splitter=FixedSizeSplitter(chunk_size=500, chunk_overlap=100), |
| 116 | + embedder=embedder, |
| 117 | + entities=node_labels, |
| 118 | + relations=rel_types, |
| 119 | + prompt_template=prompt_template, |
| 120 | + from_pdf=True |
| 121 | +) |
| 122 | +
|
| 123 | +pdf_file_paths = ['truncated-pdfs/biomolecules-11-00928-v2-trunc.pdf', |
| 124 | + 'truncated-pdfs/GAP-between-patients-and-clinicians_2023_Best-Practice-trunc.pdf', |
| 125 | + 'truncated-pdfs/pgpm-13-39-trunc.pdf'] |
| 126 | +
|
| 127 | +for path in pdf_file_paths: |
| 128 | + print(f"Processing : {path}") |
| 129 | + pdf_result = await kg_builder_pdf.run_async(file_path=path) |
| 130 | + print(f"Result: {pdf_result}") |
| 131 | +---- |
| 132 | + |
| 133 | +image::https://dist.neo4j.com/wp-content/uploads/20241015075652/document-chunk-entity.png[width=800] |
| 134 | + |
| 135 | +Then running the GraphRAG Search with the VectorCypher Retriever. |
| 136 | + |
| 137 | +[source,python] |
| 138 | +---- |
| 139 | +from neo4j_graphrag.indexes import create_vector_index |
| 140 | +
|
| 141 | +create_vector_index(driver, name="text_embeddings", label="Chunk", |
| 142 | + embedding_property="embedding", dimensions=1536, similarity_fn="cosine") |
| 143 | +
|
| 144 | +# Vector Retriever |
| 145 | +from neo4j_graphrag.retrievers import VectorRetriever |
| 146 | +
|
| 147 | +vector_retriever = VectorRetriever( |
| 148 | + driver, |
| 149 | + index_name="text_embeddings", |
| 150 | + embedder=embedder, |
| 151 | + return_properties=["text"], |
| 152 | +) |
| 153 | +
|
| 154 | +# GraphRAG Vector Cypher Retriever |
| 155 | +from neo4j_graphrag.retrievers import VectorCypherRetriever |
| 156 | +
|
| 157 | +graph_retriever = VectorCypherRetriever( |
| 158 | + driver, |
| 159 | + index_name="text_embeddings", |
| 160 | + embedder=embedder, |
| 161 | + retrieval_query=""" |
| 162 | +//1) Go out 2-3 hops in the entity graph and get relationships |
| 163 | +WITH node AS chunk |
| 164 | +MATCH (chunk)<-[:FROM_CHUNK]-(entity)-[relList:!FROM_CHUNK]-{1,2}(nb) |
| 165 | +UNWIND relList AS rel |
| 166 | +
|
| 167 | +//2) collect relationships and text chunks |
| 168 | +WITH collect(DISTINCT chunk) AS chunks, collect(DISTINCT rel) AS rels |
| 169 | +
|
| 170 | +//3) format and return context |
| 171 | +RETURN apoc.text.join([c in chunks | c.text], '\n') + |
| 172 | + apoc.text.join([r in rels | |
| 173 | + startNode(r).name+' - '+type(r)+' '+r.details+' -> '+endNode(r).name], |
| 174 | + '\n') AS info |
| 175 | +""" |
| 176 | +) |
| 177 | +
|
| 178 | +llm = OpenAILLM(model_name="gpt-4o", model_params={"temperature": 0.0}) |
| 179 | +
|
| 180 | +rag_template = RagTemplate(template='''Answer the Question using the following Context. Only respond with information mentioned in the Context. Do not inject any speculative information not mentioned. |
| 181 | +
|
| 182 | +# Question: |
| 183 | +{query_text} |
| 184 | + |
| 185 | +# Context: |
| 186 | +{context} |
| 187 | +
|
| 188 | +# Answer: |
| 189 | +''', expected_inputs=['query_text', 'context']) |
| 190 | +
|
| 191 | +vector_rag = GraphRAG(llm=llm, retriever=vector_retriever, prompt_template=rag_template) |
| 192 | +
|
| 193 | +graph_rag = GraphRAG(llm=llm, retriever=graph_retriever, prompt_template=rag_template) |
| 194 | +
|
| 195 | +q = "Can you summarize systemic lupus erythematosus (SLE)? including common effects, biomarkers, and treatments? Provide in detailed list format." |
| 196 | +
|
| 197 | +vector_rag.search(q, retriever_config={'top_k':5}).answer |
| 198 | +graph_rag.search(q, retriever_config={'top_k':5}).answer |
| 199 | +---- |
| 200 | + |
| 201 | +image::https://dist.neo4j.com/wp-content/uploads/20241128072906/Bildschirmfoto-2024-11-19-um-17.31.45.png[] |
| 202 | + |
| 203 | +== Documentation |
| 204 | + |
| 205 | +[cols="1,4"] |
| 206 | +|=== |
| 207 | +| icon:book[] Documentation | https://neo4j.com/docs/neo4j-graphrag-python/current/ |
| 208 | +| icon:book[] Guides | https://neo4j.com/docs/neo4j-graphrag-python/current/user_guide_rag.html[RAG & GraphRAG^] |
| 209 | +| icon:book[] Guides | https://neo4j.com/docs/neo4j-graphrag-python/current/user_guide_kg_builder.html[Guide Knowledge Graph Builder^] |
| 210 | + |
| 211 | +|=== |
| 212 | + |
| 213 | +== Relevant Links |
| 214 | + |
| 215 | +[cols="1,4"] |
| 216 | +|=== |
| 217 | +| icon:user[] Authors | Neo4j Engineering |
| 218 | +| icon:github[] Repository | https://github.com/neo4j/neo4j-graphrag-python[GitHub] |
| 219 | +| icon:github[] Issues | https://github.com/neo4j/neo4j-graphrag-python/issues |
| 220 | +|=== |
| 221 | + |
| 222 | + |
| 223 | +== Videos & Tutorials |
| 224 | + |
| 225 | +++++ |
| 226 | +<iframe width="560" height="315" src="https://www.youtube.com/embed/hDJlruy60AM?si=TEFW1mj91qrQnaeX" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe> |
| 227 | +++++ |
| 228 | + |
| 229 | +++++ |
| 230 | +<iframe width="560" height="315" src="https://www.youtube.com/embed/OALrsghrP_I?si=Yw08z6fiCp3y_L0j" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe> |
| 231 | +++++ |
| 232 | + |
| 233 | +== Highlighted Articles |
| 234 | + |
| 235 | +* https://neo4j.com/blog/graphrag-python-package/[GraphRAG Python Package: Accelerating GenAI With Knowledge Graphs^] |
| 236 | +* https://neo4j.com/developer-blog/get-started-graphrag-python-package/[Getting Started With the Neo4j GraphRAG Python Package^] |
| 237 | +* https://neo4j.com/developer-blog/graph-traversal-graphrag-python-package/[Vector Search With Graph Traversal Using the Neo4j GraphRAG Package^] |
| 238 | +* https://neo4j.com/developer-blog/hybrid-retrieval-graphrag-python-package/[Hybrid Retrieval Using the Neo4j GraphRAG Package for Python^] |
| 239 | +* https://neo4j.com/developer-blog/enhancing-hybrid-retrieval-graphrag-python-package/[Enhancing Hybrid Retrieval With Graph Traversal: Neo4j GraphRAG Python^] |
0 commit comments