neo4j-labs
diff --git a/‎backend/src/diffbot_transformer.py‎
Lines changed: 3 additions & 0 deletions b/‎backend/src/diffbot_transformer.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎backend/src/main.py‎
Lines changed: 25 additions & 33 deletions b/‎backend/src/main.py‎
Lines changed: 25 additions & 33 deletions
diff --git a/‎backend/src/make_relationships.py‎
Lines changed: 28 additions & 14 deletions b/‎backend/src/make_relationships.py‎
Lines changed: 28 additions & 14 deletions
diff --git a/‎experiments/Combined chunk comparision.png‎
33.4 KB b/‎experiments/Combined chunk comparision.png‎
33.4 KB
@@ -37,10 +37,13 @@ def extract_graph_from_diffbot(graph: Neo4jGraph,
     for i,chunk in enumerate(chunks):
         previous_chunk_id = current_chunk_id
         current_chunk_id = str(uuid.uuid1())
+        position = i+1
         if i == 0:
             firstChunk = True
         else:
             firstChunk = False
+        metadata = {"position": position,"length": len(chunk.page_content)}
+        chunk = Document(page_content=chunk.page_content,metadata = metadata)
         graph_document = diffbot_nlp.convert_to_graph_documents([chunk])
         graph.add_graph_documents(graph_document)
         lst_cypher_queries_chunk_relationship = create_source_chunk_entity_relationship(file_name,graph,graph_document,chunk,uri,userName,password,firstChunk,current_chunk_id,
 
@@ -33,9 +33,8 @@
 def update_exception_db(graph_obj,file_name,exp_msg):
   try:  
     job_status = "Failed"
-    source_node = "fileName: '{}'"
-    update_node_prop = 'SET d.status = "{}", d.errorMessage = "{}"'
-    graph_obj.query('MERGE(d:Document {'+source_node.format(file_name)+'}) '+update_node_prop.format(job_status,exp_msg))
+    graph_obj.query("""MERGE(d:Document {fileName :$fName}) SET d.status = $status, d.errorMessage = $error_msg""",
+                    {"fName":file_name, "status":job_status, "error_msg":exp_msg})
   except Exception as e:
     error_message = str(e)
     logging.error(f"Error in updating document node status as failed: {error_message}")
@@ -45,10 +44,15 @@ def create_source_node(graph_obj,file_name,file_size,file_type,source,model,url=
   try:   
     current_time = datetime.now()
     job_status = "New"
-    source_node = "fileName: '{}'"
-    update_node_prop = "SET d.fileSize = '{}', d.fileType = '{}' ,d.status = '{}',d.url='{}',d.awsAccessKeyId='{}',d.fileSource='{}', d.createdAt ='{}', d.updatedAt = '{}', d.processingTime = '{}', d.errorMessage = '{}', d.nodeCount= {}, d.relationshipCount = {}, d.model= '{}'"
     logging.info("create source node as file name if not exist")
-    graph_obj.query('MERGE(d:Document {'+source_node.format(file_name)+'}) '+update_node_prop.format(file_size,file_type,job_status,url,aws_access_key_id,source,current_time,current_time,0,'',0,0,model))
+    graph_obj.query("""MERGE(d:Document {fileName :$fn}) SET d.fileSize = $fs, d.fileType = $ft ,
+                    d.status = $st, d.url = $url, d.awsAccessKeyId = $awsacc_key_id, 
+                    d.fileSource = $f_source, d.createdAt = $c_at, d.updatedAt = $u_at, 
+                    d.processingTime = $pt, d.errorMessage = $e_message, d.nodeCount= $n_count, 
+                    d.relationshipCount = $r_count, d.model= $model""",
+                    {"fn":file_name, "fs":file_size, "ft":file_type, "st":job_status, "url":url,
+                     "awsacc_key_id":aws_access_key_id, "f_source":source, "c_at":current_time,
+                     "u_at":current_time, "pt":0, "e_message":'', "n_count":0, "r_count":0, "model":model})
   except Exception as e:
     error_message = str(e)
     update_exception_db(graph_obj,file_name,error_message)
@@ -73,10 +77,6 @@ def create_source_node_graph_local_file(uri, userName, password, file, model, db
     file_size = file.size
     file_name = file.filename
     source = 'local file'
-    # if db_name is not None:
-    #   graph = Neo4jGraph(url=uri, database=db_name, username=userName, password=password)
-    # else:
-    #    graph = Neo4jGraph(url=uri, username=userName, password=password)   
     graph = Neo4jGraph(url=uri, database=db_name, username=userName, password=password)
     create_source_node(graph,file_name,file_size,file_type,source,model)
     return create_api_response("Success",message="Source Node created successfully",file_source=source)
@@ -177,10 +177,6 @@ def create_source_node_graph_url(uri, userName, password, source_url ,model, db_
     """
     try:
         source_type,youtube_url = check_url_source(source_url)
-        # if db_name is not None:
-        #   graph = Neo4jGraph(url=uri, database=db_name, username=userName, password=password)
-        # else:
-        #   graph = Neo4jGraph(url=uri, username=userName, password=password)
         graph = Neo4jGraph(url=uri, database=db_name, username=userName, password=password)
         logging.info(f"source type URL:{source_type}")
         if source_type == "s3 bucket":
@@ -216,7 +212,6 @@ def create_source_node_graph_url(uri, userName, password, source_url ,model, db_
             return create_api_response("Success",message="Source Node created successfully",success_count=success_count,Failed_count=Failed_count,file_source='s3 bucket',file_name=lst_s3_file_name)
         elif source_type == 'youtube':
             source_url= youtube_url
-           # match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", source_url)
             match = re.search(r'(?:v=)([0-9A-Za-z_-]{11})\s*',source_url)
             logging.info(f"match value{match}")
             file_name = YouTube(source_url).title
@@ -266,8 +261,6 @@ def file_into_chunks(pages: List[Document]):
     logging.info("Split file into smaller chunks")
     text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=20)
     chunks = text_splitter.split_documents(pages)
-    # print('Before chunks',len(chunks))
-    #chunks=chunks[:10]
     return chunks
 
 def get_s3_pdf_content(s3_url,aws_access_key_id=None,aws_secret_access_key=None):
@@ -324,14 +317,9 @@ def extract_graph_from_file(uri, userName, password, model, db_name=None, file=N
    	 Json response to API with fileName, nodeCount, relationshipCount, processingTime, 
      status and model as attributes.
   """
-  # logging.info(f"extract_graph_from_file called for file:{file.filename}")
   try:
     start_time = datetime.now()
     file_name = ''
-    # if db_name is not None:
-    #   graph = Neo4jGraph(url=uri, database=db_name, username=userName, password=password)
-    # else:
-    #    graph = Neo4jGraph(url=uri, username=userName, password=password) 
     graph = Neo4jGraph(url=uri, database=db_name, username=userName, password=password)
     source_node = "fileName: '{}'"
 
@@ -418,7 +406,15 @@ def extract_graph_from_file(uri, userName, password, model, db_name=None, file=N
     job_status = "Completed"
     error_message =""
     logging.info("Update source node properties")
-    graph.query('MERGE(d:Document {'+source_node.format(file_key.split('/')[-1])+'}) '+update_node_prop.format(start_time,end_time,round(processed_time.total_seconds(),2),job_status,error_message,nodes_created,relationships_created,model))
+    graph.query("""MERGE(d:Document {fileName :$fn}) SET d.status = $st, d.createdAt = $c_at, 
+                    d.updatedAt = $u_at, d.processingTime = $pt, d.nodeCount= $n_count, 
+                    d.relationshipCount = $r_count, d.model= $model
+                """,
+                {"fn":file_key.split('/')[-1], "st":job_status, "c_at":start_time,
+                  "u_at":end_time, "pt":round(processed_time.total_seconds(),2), "e_message":'',
+                  "n_count":nodes_created, "r_count":relationships_created, "model":model
+                }
+                )
 
     output = {
         "fileName": file_name,
@@ -502,12 +498,6 @@ def get_source_list_from_graph(uri,userName,password,db_name=None):
  """
   logging.info("Get existing files list from graph")
   try:
-    # if len(db_name)!=0:
-    #   logging.info(f"Fetching source list from, database = {db_name}")
-    #   graph = Neo4jGraph(url=uri, database=db_name, username=userName, password=password)
-    # else:
-    #    logging.info(f"Fetching source list from default database (neo4j)")
-    #    graph = Neo4jGraph(url=uri, username=userName, password=password)
     graph = Neo4jGraph(url=uri, database=db_name, username=userName, password=password)
     query = "MATCH(d:Document) RETURN d ORDER BY d.updatedAt DESC"
     result = graph.query(query)
@@ -526,12 +516,14 @@ def update_graph(uri,userName,password,db_name):
   """
   try:   
     knn_min_score = os.environ.get('KNN_MIN_SCORE')
-
-    query = "WHERE node <> c and score >= {} MERGE (c)-[rel:SIMILAR]-(node) SET rel.score = score"
     graph = Neo4jGraph(url=uri, database=db_name, username=userName, password=password)
     result = graph.query("""MATCH (c:Chunk)
-                WHERE c.embedding IS NOT NULL AND count { (c)-[:SIMILAR]-() } < 5
-                CALL db.index.vector.queryNodes('vector', 6, c.embedding) yield node, score """+ query.format(knn_min_score))
+                            WHERE c.embedding IS NOT NULL AND count { (c)-[:SIMILAR]-() } < 5
+                            CALL db.index.vector.queryNodes('vector', 6, c.embedding) yield node, score
+                            WHERE node <> c and score >= $score MERGE (c)-[rel:SIMILAR]-(node) SET rel.score = score
+                         """,
+                         {"score":knn_min_score}
+                         )
     logging.info(f"result : {result}")
   except Exception as e:
     error_message = str(e)
 
@@ -33,14 +33,11 @@ def create_source_chunk_entity_relationship(source_file_name :str,
     """
     source_node = 'fileName: "{}"'
     lst_cypher_queries_chunk_relationship = []
-    # logging.info(f'Graph Document print{graph_document}')
-    # openai_api_key = os.environ.get('OPENAI_API_KEY')
-    embedding_model = os.environ.get('EMBEDDING_MODEL')
-    isEmbedding = os.environ.get('IS_EMBEDDING')
-    
+    embedding_model = os.getenv('EMBEDDING_MODEL')
+    isEmbedding = os.getenv('IS_EMBEDDING')
     chunk_node_id_set = 'id:"{}"'
-    update_chunk_node_prop = ' SET c.text = "{}"'
-    if isEmbedding:
+    
+    if isEmbedding.upper() == "TRUE":
         Neo4jVector.from_documents(
             [chunk],
             OpenAIEmbeddings(model=embedding_model),
@@ -50,23 +47,40 @@ def create_source_chunk_entity_relationship(source_file_name :str,
             ids=[current_chunk_id]
         )
     else:
-        graph.query('MERGE(c:Chunk {id:"'+ current_chunk_id+'"})' + update_chunk_node_prop.format(chunk.page_content))
+        graph.query("""MERGE(c:Chunk {id : $id}) SET c.text = $pg_content, c.position = $position, 
+                    c.length = $length
+                    """,
+                    {"id":current_chunk_id,"pg_content":chunk.page_content, "position": chunk.metadata['position'],
+                     "length": chunk.metadata['length']
+                    })
 
     logging.info("make PART_OF relationship between chunk node and document node")
-    graph.query('MATCH(d:Document {'+source_node.format(source_file_name)+'}) ,(c:Chunk {'+chunk_node_id_set.format(current_chunk_id)+'}) MERGE (c)-[:PART_OF]->(d)')
+    graph.query("""MATCH(d:Document {fileName : $f_name}) ,(c:Chunk {id : $chunk_id}) 
+                MERGE (c)-[:PART_OF]->(d)
+                """,
+                {"f_name":source_file_name,"chunk_id":current_chunk_id})
 
-    # logging.info("make FIRST_CHUNK, NEXT_CHUNK relationship between chunk node and document node")
-    if isFirstChunk:
+    #FYI: The FIRST_CHUNK and NEXT_CHUNK relationship queries are collected in the list below instead of being run here, because this function runs in a thread (chunks are created asynchronously) and some relationships were not created when the chunks did not exist yet.
+    #These chunk relationship queries are executed at the end of the file processing.
+    #The queries below could not be parameterized because each list entry holds only a single value (the query string), whereas a parameterized query also needs its parameters (2 parameters).
+    if isFirstChunk: 
         lst_cypher_queries_chunk_relationship.append('MATCH(d:Document {'+source_node.format(source_file_name)+'}) ,(c:Chunk {'+chunk_node_id_set.format(current_chunk_id)+'}) MERGE (d)-[:FIRST_CHUNK]->(c)')
-        # graph.query('MATCH(d:Document {'+source_node.format(source_file_name)+'}) ,(c:Chunk {'+chunk_node_id_set.format(current_chunk_id)+'}) CREATE (d)-[:FIRST_CHUNK]->(c)')
     else:
         lst_cypher_queries_chunk_relationship.append('MATCH(pc:Chunk {'+chunk_node_id_set.format(previous_chunk_id)+'}) ,(cc:Chunk {'+chunk_node_id_set.format(current_chunk_id)+'}) MERGE (pc)-[:NEXT_CHUNK]->(cc)')
-        # graph.query('MATCH(pc:Chunk {'+chunk_node_id_set.format(previous_chunk_id)+'}) ,(cc:Chunk {'+chunk_node_id_set.format(current_chunk_id)+'}) CREATE (pc)-[:NEXT_CHUNK]->(cc)')
     # dict = {}
     # nodes_list = []
     for node in graph_document[0].nodes:
         node_id = node.id
-        result = graph.query('MATCH(c:Chunk {'+chunk_node_id_set.format(current_chunk_id)+'}), (n:'+ node.type +'{ id: "'+node_id+'"}) MERGE (c)-[:HAS_ENTITY]->(n)')
+        #The query below also cannot be parameterized, because Cypher does not allow a node label (node type) to be passed as a parameter.
+        #https://neo4j.com/docs/cypher-manual/current/syntax/parameters/
+
+        graph.query('MATCH(c:Chunk {'+chunk_node_id_set.format(current_chunk_id)+'}), (n:'+ node.type +'{ id: "'+node_id+'"}) MERGE (c)-[:HAS_ENTITY]->(n)')
+
+        # graph.query("""MATCH(c:Chunk {id : $chunk_id}), (n:$node_type{ id: $node_id}) 
+        #             MERGE (c)-[:HAS_ENTITY]->(n)
+        #             """,
+        #             {"chunk_id":current_chunk_id,"node_type":node.type, "node_id":node_id})
+
     #     json_obj = {'node_id': node_id, 'node_type' : node.type, 'uuid' : chunk_uuid}
     #     nodes_list.append(json_obj)
     return lst_cypher_queries_chunk_relationship