99import os
1010import uuid
1111import hashlib
12+ import time
13+ from langchain_neo4j import Neo4jVector
1214
1315logging .basicConfig (format = '%(asctime)s - %(message)s' ,level = 'INFO' )
1416
@@ -41,28 +43,8 @@ def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_docume
4143 """
4244 graph .query (unwind_query , params = {"batch_data" : batch_data })
4345
44- graph .query ('MATCH(c:Chunk {' + chunk_node_id_set .format (graph_doc_chunk_id ['chunk_id' ])+ '}) MERGE (n:' + node .type + '{ id: "' + node_id + '"}) MERGE (c)-[:HAS_ENTITY]->(n)' )
45-
def load_embedding_model(embedding_model_name: str):
    """Build the embedding client for the requested provider.

    Args:
        embedding_model_name: Provider key. ``"openai"`` and ``"vertexai"``
            are matched exactly; every other value selects the
            SentenceTransformer model.

    Returns:
        A ``(embeddings, dimension)`` tuple: the constructed embedding client
        and the vector size hard-coded for it here (1536 for OpenAI, 768 for
        Vertex AI, 384 for SentenceTransformer).
    """
    if embedding_model_name == "openai":
        client = OpenAIEmbeddings()
        logging.info("Embedding: Using OpenAI")
        return client, 1536

    if embedding_model_name == "vertexai":
        client = VertexAIEmbeddings(model="textembedding-gecko@003")
        logging.info("Embedding: Using Vertex AI Embeddings")
        return client, 768

    # Any other name falls through to the SentenceTransformer model.
    client = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    logging.info("Embedding: Using SentenceTransformer")
    return client, 384
65- def update_embedding_create_vector_index (graph , chunkId_chunkDoc_list , file_name ):
46+
47+ def create_chunk_embeddings (graph , chunkId_chunkDoc_list , file_name ):
6648 #create embedding
6749 isEmbedding = os .getenv ('IS_EMBEDDING' )
6850 # embedding_model = os.getenv('EMBEDDING_MODEL')
@@ -79,35 +61,6 @@ def update_embedding_create_vector_index(graph, chunkId_chunkDoc_list, file_name
7961 "chunkId" : row ['chunk_id' ],
8062 "embeddings" : embeddings_arr
8163 })
82- # graph.query("""MATCH (d:Document {fileName : $fileName})
83- # MERGE (c:Chunk {id:$chunkId}) SET c.embedding = $embeddings
84- # MERGE (c)-[:PART_OF]->(d)
85- # """,
86- # {
87- # "fileName" : file_name,
88- # "chunkId": row['chunk_id'],
89- # "embeddings" : embeddings_arr
90- # }
91- # )
92- logging .info ('create vector index on chunk embedding' )
93- # result = graph.query("SHOW INDEXES YIELD * WHERE labelsOrTypes = ['Chunk'] and name = 'vector'")
94- vector_index = graph .query ("SHOW INDEXES YIELD * WHERE labelsOrTypes = ['Chunk'] and type = 'VECTOR' AND name = 'vector' return options" )
95- # if result:
96- # logging.info(f"vector index dropped for 'Chunk'")
97- # graph.query("DROP INDEX vector IF EXISTS;")
98-
99- if len (vector_index ) == 0 :
100- logging .info (f'vector index is not exist, will create in next query' )
101- graph .query ("""CREATE VECTOR INDEX `vector` if not exists for (c:Chunk) on (c.embedding)
102- OPTIONS {indexConfig: {
103- `vector.dimensions`: $dimensions,
104- `vector.similarity_function`: 'cosine'
105- }}
106- """ ,
107- {
108- "dimensions" : dimension
109- }
110- )
11164
11265 query_to_create_embedding = """
11366 UNWIND $data AS row
@@ -214,4 +167,27 @@ def create_relation_between_chunks(graph, file_name, chunks: List[Document])->li
214167 """
215168 graph .query (query_to_create_NEXT_CHUNK_relation , params = {"relationships" : relationships })
216169
217- return lst_chunks_including_hash
170+ return lst_chunks_including_hash
171+
172+
def create_chunk_vector_index(graph):
    """Create the vector index on ``Chunk.embedding`` unless one already exists.

    The existence check matches on index type, node label and indexed
    property rather than on the index name. The previous check looked for an
    index named ``vector`` while this function creates ``vector_index``, so it
    could never detect an index created by this very function and every call
    re-attempted creation.

    Args:
        graph: Graph connection exposing ``query(cypher, params=...)``; it is
            also handed to ``Neo4jVector`` to create the index.

    Returns:
        None. The outcome is reported through ``logging.info``.

    Raises:
        Exception: Any error from the lookup or the index creation is
            re-raised, except one whose message contains
            ``EquivalentSchemaRuleAlreadyExists``, which is logged and
            treated as "already created".
    """
    node_label = "Chunk"
    embedding_property = "embedding"
    start_time = time.time()
    try:
        existing_index = graph.query(
            "SHOW INDEXES YIELD * "
            "WHERE type = 'VECTOR' AND labelsOrTypes = [$label] "
            "AND properties = [$property] RETURN name",
            params={"label": node_label, "property": embedding_property},
        )

        if existing_index:
            logging.info(f"Index already exist,Skipping creation. Time taken: {time.time() - start_time:.2f} seconds")
            return

        vector_store = Neo4jVector(
            embedding=EMBEDDING_FUNCTION,
            graph=graph,
            node_label=node_label,
            embedding_node_property=embedding_property,
            index_name="vector_index",
        )
        vector_store.create_new_index()
        logging.info(f"Index created successfully. Time taken: {time.time() - start_time:.2f} seconds")
    except Exception as e:
        # NOTE(review): presumably raised when an equivalent index appears
        # between the lookup and the creation (e.g. a concurrent worker);
        # that outcome is the desired end state, so it is not an error.
        if "EquivalentSchemaRuleAlreadyExists" in str(e):
            logging.info("Vector index already exists, skipping creation.")
        else:
            raise
0 commit comments