Handled EquivalentSchemaRuleAlreadyExists error caused by a race condition during vector index creation (#949)

kaustubh-darekar · kartikpersistent · commit 82dee71889f2 · 2025-01-28T15:48:13.000Z
diff --git a/backend/src/main.py b/backend/src/main.py
@@ -338,6 +338,7 @@ async def processing_source(uri, userName, password, database, model, file_name,
   logging.info(f'Time taken database connection: {elapsed_create_connection:.2f} seconds')
   uri_latency["create_connection"] = f'{elapsed_create_connection:.2f}'
   graphDb_data_Access = graphDBdataAccess(graph)
+  create_chunk_vector_index(graph)
   start_get_chunkId_chunkDoc_list = time.time()
   total_chunks, chunkId_chunkDoc_list = get_chunkId_chunkDoc_list(graph, file_name, pages, retry_condition)
   end_get_chunkId_chunkDoc_list = time.time()
@@ -482,7 +483,7 @@ async def processing_chunks(chunkId_chunkDoc_list,graph,uri, userName, password,
     graph = create_graph_database_connection(uri, userName, password, database)
   
   start_update_embedding = time.time()
-  update_embedding_create_vector_index( graph, chunkId_chunkDoc_list, file_name)
+  create_chunk_embeddings( graph, chunkId_chunkDoc_list, file_name)
   end_update_embedding = time.time()
   elapsed_update_embedding = end_update_embedding - start_update_embedding
   logging.info(f'Time taken to update embedding in chunk node: {elapsed_update_embedding:.2f} seconds')
diff --git a/backend/src/make_relationships.py b/backend/src/make_relationships.py
@@ -9,6 +9,8 @@
 import os
 import uuid
 import hashlib
+import time
+from langchain_neo4j import Neo4jVector
 
 logging.basicConfig(format='%(asctime)s - %(message)s',level='INFO')
 
@@ -41,28 +43,8 @@ def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_docume
                 """
         graph.query(unwind_query, params={"batch_data": batch_data})
 
-            graph.query('MATCH(c:Chunk {'+chunk_node_id_set.format(graph_doc_chunk_id['chunk_id'])+'}) MERGE (n:'+ node.type +'{ id: "'+node_id+'"}) MERGE (c)-[:HAS_ENTITY]->(n)')
-
-def load_embedding_model(embedding_model_name: str):
-    if embedding_model_name == "openai":
-        embeddings = OpenAIEmbeddings()
-        dimension = 1536
-        logging.info("Embedding: Using OpenAI")
-    elif embedding_model_name == "vertexai":        
-        embeddings = VertexAIEmbeddings(
-            model="textembedding-gecko@003"
-        )
-        dimension = 768
-        logging.info("Embedding: Using Vertex AI Embeddings")
-    else:
-        embeddings = SentenceTransformerEmbeddings(
-            model_name="all-MiniLM-L6-v2"#, cache_folder="/embedding_model"
-        )
-        dimension = 384
-        logging.info("Embedding: Using SentenceTransformer")
-    return embeddings, dimension
-
-def update_embedding_create_vector_index(graph, chunkId_chunkDoc_list, file_name):
+    
+def create_chunk_embeddings(graph, chunkId_chunkDoc_list, file_name):
     #create embedding
     isEmbedding = os.getenv('IS_EMBEDDING')
     # embedding_model = os.getenv('EMBEDDING_MODEL')
@@ -79,35 +61,6 @@ def update_embedding_create_vector_index(graph, chunkId_chunkDoc_list, file_name
                 "chunkId": row['chunk_id'],
                 "embeddings": embeddings_arr
             })
-            # graph.query("""MATCH (d:Document {fileName : $fileName})
-            #                MERGE (c:Chunk {id:$chunkId}) SET c.embedding = $embeddings 
-            #                MERGE (c)-[:PART_OF]->(d)
-            #             """,
-            #             {
-            #                 "fileName" : file_name,
-            #                 "chunkId": row['chunk_id'],
-            #                 "embeddings" : embeddings_arr
-            #             }
-            #             )
-            logging.info('create vector index on chunk embedding')
-            # result = graph.query("SHOW INDEXES YIELD * WHERE labelsOrTypes = ['Chunk'] and name = 'vector'")
-            vector_index = graph.query("SHOW INDEXES YIELD * WHERE labelsOrTypes = ['Chunk'] and type = 'VECTOR' AND name = 'vector' return options")
-            # if result:
-            #     logging.info(f"vector index dropped for 'Chunk'")
-            #     graph.query("DROP INDEX vector IF EXISTS;")
-
-            if len(vector_index) == 0:
-                logging.info(f'vector index is not exist, will create in next query')
-                graph.query("""CREATE VECTOR INDEX `vector` if not exists for (c:Chunk) on (c.embedding)
-                                OPTIONS {indexConfig: {
-                                `vector.dimensions`: $dimensions,
-                                `vector.similarity_function`: 'cosine'
-                                }}
-                            """,
-                            {
-                                "dimensions" : dimension
-                            }
-                            )
     
     query_to_create_embedding = """
         UNWIND $data AS row
@@ -214,4 +167,27 @@ def create_relation_between_chunks(graph, file_name, chunks: List[Document])->li
         """
     graph.query(query_to_create_NEXT_CHUNK_relation, params={"relationships": relationships})   
     
-    return lst_chunks_including_hash
+    return lst_chunks_including_hash
+
+
+def create_chunk_vector_index(graph):
+    """Create the `vector` index on `Chunk.embedding` if it is missing.
+
+    Another worker can create the index between the lookup and the creation; that
+    EquivalentSchemaRuleAlreadyExists error is logged and ignored, any other is re-raised.
+    """
+    start_time = time.time()
+    try:
+        vector_index = graph.query("SHOW INDEXES YIELD * WHERE labelsOrTypes = ['Chunk'] and type = 'VECTOR' AND name = 'vector' return options")
+        if not vector_index:
+            # index_name must equal the name filtered on above, or the lookup never finds the index.
+            vector_store = Neo4jVector(embedding=EMBEDDING_FUNCTION, graph=graph, node_label="Chunk",
+                                       embedding_node_property="embedding", index_name="vector")
+            vector_store.create_new_index()
+            logging.info(f"Index created successfully. Time taken: {time.time() - start_time:.2f} seconds")
+        else:
+            logging.info(f"Index already exist,Skipping creation. Time taken: {time.time() - start_time:.2f} seconds")
+    except Exception as e:
+        if "EquivalentSchemaRuleAlreadyExists" not in str(e):
+            raise
+        logging.info("Vector index already exists, skipping creation.")