Skip to content

Commit 9ff034f

Browse files
Merge pull request #109 from neo4j-labs/update_model_relationship
Update model, Create relation between chunks of each document as NEXT…
2 parents 00d6c04 + 5b19d9e commit 9ff034f

File tree

5 files changed

+75
-32
lines changed

5 files changed

+75
-32
lines changed

backend/score.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,12 +144,15 @@ async def get_source_list(uri:str,
144144
return result
145145

146146
@app.post("/update_similarity_graph")
147-
async def update_similarity_graph():
147+
async def update_similarity_graph(uri=Form(None),
148+
userName=Form(None),
149+
password=Form(None),
150+
database=Form(None)):
148151
"""
149152
Calls 'update_graph' which posts the query to update the similar nodes in the graph
150153
"""
151154

152-
result = await asyncio.to_thread(update_graph)
155+
result = await asyncio.to_thread(update_graph,uri,userName,password,database)
153156
return result
154157

155158
def decode_password(pwd):

backend/src/diffbot_transformer.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from typing import List
66
import os
77
import logging
8+
import uuid
89

910
logging.basicConfig(format='%(asctime)s - %(message)s',level='INFO')
1011
def extract_graph_from_diffbot(graph: Neo4jGraph,
@@ -31,16 +32,21 @@ def extract_graph_from_diffbot(graph: Neo4jGraph,
3132
graph_document_list = []
3233

3334
logging.info(f"create relationship between source,chunk and entity nodes created from Diffbot")
35+
current_chunk_id = ''
36+
relationship_cypher_list = []
3437
for i,chunk in enumerate(chunks):
38+
previous_chunk_id = current_chunk_id
39+
current_chunk_id = str(uuid.uuid1())
3540
if i == 0:
3641
firstChunk = True
3742
else:
3843
firstChunk = False
3944
graph_document = diffbot_nlp.convert_to_graph_documents([chunk])
4045
graph.add_graph_documents(graph_document)
41-
create_source_chunk_entity_relationship(file_name,graph,graph_document,chunk,uri,userName,password,firstChunk)
42-
graph_document_list.append(graph_document[0])
46+
lst_cypher_queries_chunk_relationship = create_source_chunk_entity_relationship(file_name,graph,graph_document,chunk,uri,userName,password,firstChunk,current_chunk_id,
47+
previous_chunk_id)
48+
graph_document_list.append(graph_document[0])
49+
relationship_cypher_list.extend(lst_cypher_queries_chunk_relationship)
4350

4451
graph.refresh_schema()
45-
return graph_document_list
46-
52+
return graph_document_list, relationship_cypher_list

backend/src/main.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -362,18 +362,19 @@ def extract_graph_from_file(uri, userName, password, model, db_name=None, file=N
362362

363363
logging.info("Get graph document list from models")
364364
if model == 'Diffbot' :
365-
graph_documents = extract_graph_from_diffbot(graph,chunks,file_name,uri,userName,password)
365+
graph_documents, cypher_list = extract_graph_from_diffbot(graph,chunks,file_name,uri,userName,password)
366366

367367
elif model == 'OpenAI GPT 3.5':
368368
model_version = 'gpt-3.5-turbo-16k'
369-
graph_documents = extract_graph_from_OpenAI(model_version,graph,chunks,file_name,uri,userName,password)
369+
graph_documents, cypher_list = extract_graph_from_OpenAI(model_version,graph,chunks,file_name,uri,userName,password)
370370

371371
elif model == 'OpenAI GPT 4':
372372
model_version = 'gpt-4-0125-preview'
373-
graph_documents = extract_graph_from_OpenAI(model_version,graph,chunks,file_name,uri,userName,password)
373+
graph_documents, cypher_list = extract_graph_from_OpenAI(model_version,graph,chunks,file_name,uri,userName,password)
374374

375-
#update_similarity_graph for the KNN Graph
376-
update_graph(graph)
375+
#create relation between chunks (FIRST_CHUNK and NEXT_CHUNK)
376+
for query in cypher_list:
377+
graph.query(query)
377378

378379
distinct_nodes = set()
379380
relations = []
@@ -498,15 +499,15 @@ def get_source_list_from_graph(uri,userName,password,db_name=None):
498499
logging.exception(f'Exception:{error_message}')
499500
return create_api_response(job_status,message=message,error=error_message)
500501

501-
def update_graph(graph):
502+
def update_graph(uri,userName,password,db_name):
502503
"""
503504
Update the graph node with SIMILAR relationship where embedding scrore match
504505
"""
505506
try:
506507
knn_min_score = os.environ.get('KNN_MIN_SCORE')
507508

508509
query = "WHERE node <> c and score >= {} MERGE (c)-[rel:SIMILAR]-(node) SET rel.score = score"
509-
# graph = Neo4jGraph()
510+
graph = Neo4jGraph(url=uri, database=db_name, username=userName, password=password)
510511
result = graph.query("""MATCH (c:Chunk)
511512
WHERE c.embedding IS NOT NULL AND count { (c)-[:SIMILAR]-() } < 5
512513
CALL db.index.vector.queryNodes('vector', 6, c.embedding) yield node, score """+ query.format(knn_min_score))

backend/src/make_relationships.py

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@ def create_source_chunk_entity_relationship(source_file_name :str,
1515
uri : str,
1616
userName : str,
1717
password : str,
18-
isFirstChunk : bool):
18+
isFirstChunk : bool,
19+
current_chunk_id:uuid,
20+
previous_chunk_id:uuid)-> list:
1921
""" Create relationship between source, chunk and entity nodes
2022
Args:
2123
source_file_name (str): file name of input source
@@ -26,14 +28,16 @@ def create_source_chunk_entity_relationship(source_file_name :str,
2628
userName: Username to use for graph creation ( if None will use username from config file )
2729
password: Password to use for graph creation ( if None will use password from config file )
2830
isFirstChunk : It's a bool value to create the FIRST_CHUNK and NEXT_CHUNK relationships between chunk and document node.
31+
current_chunk_id : Unique id of chunk
32+
previous_chunk_id : Unique id of previous chunk
2933
"""
3034
source_node = 'fileName: "{}"'
35+
lst_cypher_queries_chunk_relationship = []
3136
# logging.info(f'Graph Document print{graph_document}')
3237
# openai_api_key = os.environ.get('OPENAI_API_KEY')
3338
embedding_model = os.environ.get('EMBEDDING_MODEL')
3439
isEmbedding = os.environ.get('IS_EMBEDDING')
3540

36-
chunk_uuid = str(uuid.uuid1())
3741
chunk_node_id_set = 'id:"{}"'
3842
update_chunk_node_prop = ' SET c.text = "{}"'
3943
if isEmbedding:
@@ -43,27 +47,29 @@ def create_source_chunk_entity_relationship(source_file_name :str,
4347
url=uri,
4448
username=userName,
4549
password=password,
46-
ids=[chunk_uuid]
50+
ids=[current_chunk_id]
4751
)
4852
else:
49-
graph.query('CREATE(c:Chunk {id:"'+ chunk_uuid+'"})' + update_chunk_node_prop.format(chunk.page_content))
53+
graph.query('MERGE(c:Chunk {id:"'+ current_chunk_id+'"})' + update_chunk_node_prop.format(chunk.page_content))
5054

5155
logging.info("make PART_OF relationship between chunk node and document node")
52-
graph.query('MATCH(d:Document {'+source_node.format(source_file_name)+'}) ,(c:Chunk {'+chunk_node_id_set.format(chunk_uuid)+'}) CREATE (c)-[:PART_OF]->(d)')
56+
graph.query('MATCH(d:Document {'+source_node.format(source_file_name)+'}) ,(c:Chunk {'+chunk_node_id_set.format(current_chunk_id)+'}) MERGE (c)-[:PART_OF]->(d)')
5357

54-
logging.info("make FIRST_CHUNK, NEXT_CHUNK relationship between chunk node and document node")
58+
# logging.info("make FIRST_CHUNK, NEXT_CHUNK relationship between chunk node and document node")
5559
if isFirstChunk:
56-
graph.query('MATCH(d:Document {'+source_node.format(source_file_name)+'}) ,(c:Chunk {'+chunk_node_id_set.format(chunk_uuid)+'}) CREATE (d)-[:FIRST_CHUNK]->(c)')
60+
lst_cypher_queries_chunk_relationship.append('MATCH(d:Document {'+source_node.format(source_file_name)+'}) ,(c:Chunk {'+chunk_node_id_set.format(current_chunk_id)+'}) MERGE (d)-[:FIRST_CHUNK]->(c)')
61+
# graph.query('MATCH(d:Document {'+source_node.format(source_file_name)+'}) ,(c:Chunk {'+chunk_node_id_set.format(current_chunk_id)+'}) CREATE (d)-[:FIRST_CHUNK]->(c)')
5762
else:
58-
graph.query('MATCH(d:Document {'+source_node.format(source_file_name)+'}) ,(c:Chunk {'+chunk_node_id_set.format(chunk_uuid)+'}) CREATE (d)-[:NEXT_CHUNK]->(c)')
63+
lst_cypher_queries_chunk_relationship.append('MATCH(pc:Chunk {'+chunk_node_id_set.format(previous_chunk_id)+'}) ,(cc:Chunk {'+chunk_node_id_set.format(current_chunk_id)+'}) MERGE (pc)-[:NEXT_CHUNK]->(cc)')
64+
# graph.query('MATCH(pc:Chunk {'+chunk_node_id_set.format(previous_chunk_id)+'}) ,(cc:Chunk {'+chunk_node_id_set.format(current_chunk_id)+'}) CREATE (pc)-[:NEXT_CHUNK]->(cc)')
5965
# dict = {}
6066
# nodes_list = []
6167
for node in graph_document[0].nodes:
6268
node_id = node.id
63-
result = graph.query('MATCH(c:Chunk {'+chunk_node_id_set.format(chunk_uuid)+'}), (n:'+ node.type +'{ id: "'+node_id+'"}) CREATE (c)-[:HAS_ENTITY]->(n)')
69+
result = graph.query('MATCH(c:Chunk {'+chunk_node_id_set.format(current_chunk_id)+'}), (n:'+ node.type +'{ id: "'+node_id+'"}) MERGE (c)-[:HAS_ENTITY]->(n)')
6470
# json_obj = {'node_id': node_id, 'node_type' : node.type, 'uuid' : chunk_uuid}
6571
# nodes_list.append(json_obj)
66-
72+
return lst_cypher_queries_chunk_relationship
6773
# dict['chunk_doc'] = chunk.page_content
6874
# dict['rel_chunk_entity_node'] = nodes_list
6975
# dict['nodes_created_in_chunk'] = len(graph_document[0].nodes)

backend/src/openAI_llm.py

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import concurrent.futures
2626
from concurrent.futures import ThreadPoolExecutor
2727
import threading
28+
import uuid
2829

2930
load_dotenv()
3031
logging.basicConfig(format='%(asctime)s - %(message)s',level='INFO')
@@ -190,6 +191,13 @@ def extract_and_store_graph(
190191
model_version,
191192
graph: Neo4jGraph,
192193
document: Document,
194+
file_name: str,
195+
uri: str,
196+
userName:str,
197+
password:str,
198+
firstChunk:bool,
199+
current_chunk_id:uuid,
200+
previous_chunk_id:uuid,
193201
nodes:Optional[List[str]] = None,
194202
rels:Optional[List[str]]=None) -> None:
195203

@@ -198,10 +206,18 @@ def extract_and_store_graph(
198206
store the result into a Neo4jGraph.
199207
200208
Args:
201-
graph: Neo4j graph to store the data into
202-
document: Langchain document to extract data from
203-
nodes: List of nodes to extract ( default : None )
204-
rels: List of relationships to extract ( default : None )
209+
model_version: LLM model version
210+
graph: Neo4j graph to store the data into
211+
document: Langchain document to extract data from
212+
file_name (str): file name of input source
213+
uri: URI of the graph to extract
214+
userName: Username to use for graph creation ( if None will use username from config file )
215+
password: Password to use for graph creation ( if None will use password from config file )
216+
firstChunk : It's a bool value to create the FIRST_CHUNK and NEXT_CHUNK relationships between chunk and document node.
217+
current_chunk_id : Unique id of chunk
218+
previous_chunk_id : Unique id of previous chunk
219+
nodes: List of nodes to extract ( default : None )
220+
rels: List of relationships to extract ( default : None )
205221
206222
Returns:
207223
The GraphDocument that was extracted and stored into the Neo4jgraph
@@ -221,7 +237,9 @@ def extract_and_store_graph(
221237
)]
222238

223239
graph.add_graph_documents(graph_document)
224-
return graph_document
240+
lst_cypher_queries_chunk_relationship = create_source_chunk_entity_relationship(file_name,graph,graph_document,document,uri,userName,password,firstChunk,current_chunk_id,
241+
previous_chunk_id)
242+
return graph_document, lst_cypher_queries_chunk_relationship
225243

226244

227245
def extract_graph_from_OpenAI(model_version,
@@ -248,19 +266,28 @@ def extract_graph_from_OpenAI(model_version,
248266
"""
249267
openai_api_key = os.environ.get('OPENAI_API_KEY')
250268
graph_document_list = []
269+
relationship_cypher_list = []
251270
futures=[]
252271
logging.info(f"create relationship between source,chunk and entity nodes created from {model_version}")
253272

254273
with ThreadPoolExecutor(max_workers=10) as executor:
274+
current_chunk_id= ''
255275
for i, chunk_document in tqdm(enumerate(chunks), total=len(chunks)):
276+
previous_chunk_id = current_chunk_id
277+
current_chunk_id = str(uuid.uuid1())
278+
position = i+1
256279
if i == 0:
257280
firstChunk = True
258281
else:
259282
firstChunk = False
260-
futures.append(executor.submit(extract_and_store_graph,model_version,graph,chunk_document))
283+
metadata = {"position": position,"length": len(chunk_document.page_content)}
284+
chunk_document = Document(page_content=chunk_document.page_content,metadata = metadata)
285+
286+
futures.append(executor.submit(extract_and_store_graph,model_version,graph,chunk_document,file_name,uri,userName,password,firstChunk,current_chunk_id,previous_chunk_id))
261287
for future in concurrent.futures.as_completed(futures):
262-
graph_document = future.result()
263-
create_source_chunk_entity_relationship(file_name,graph,graph_document,chunk_document,uri,userName,password,firstChunk)
288+
graph_document,lst_cypher_queries_chunk_relationship = future.result()
289+
264290
graph_document_list.append(graph_document[0])
291+
relationship_cypher_list.extend(lst_cypher_queries_chunk_relationship)
265292

266-
return graph_document_list
293+
return graph_document_list, relationship_cypher_list

0 commit comments

Comments
 (0)