 import asyncio
+import datetime
+import dateutil.parser as parser
 import logging
 import os
-from typing import List, Optional
+from typing import Dict, List, Optional
 
 from azure.search.documents.indexes.models import (
     AzureOpenAIVectorizer,
@@ -70,92 +72,107 @@ async def create_index(self, vectorizers: Optional[List[VectorSearchVectorizer]]
         logger.info("Checking whether search index %s exists...", self.search_info.index_name)
 
         async with self.search_info.create_search_index_client() as search_index_client:
-
-            if self.search_info.index_name not in [name async for name in search_index_client.list_index_names()]:
-                logger.info("Creating new search index %s", self.search_info.index_name)
-                fields = [
-                    (
-                        SimpleField(name="id", type="Edm.String", key=True)
-                        if not self.use_int_vectorization
-                        else SearchField(
-                            name="id",
+            fields = [
+                (
+                    SimpleField(name="id", type="Edm.String", key=True)
+                    if not self.use_int_vectorization
+                    else SearchField(
+                        name="id",
+                        type="Edm.String",
+                        key=True,
+                        sortable=True,
+                        filterable=True,
+                        facetable=True,
+                        analyzer_name="keyword",
+                    )
+                ),
+                SearchableField(
+                    name="content",
+                    type="Edm.String",
+                    analyzer_name=self.search_analyzer_name,
+                ),
+                SearchField(
+                    name="embedding",
+                    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
+                    hidden=False,
+                    searchable=True,
+                    filterable=False,
+                    sortable=False,
+                    facetable=False,
+                    vector_search_dimensions=self.embedding_dimensions,
+                    vector_search_profile_name="embedding_config",
+                ),
+                SimpleField(name="category",
                             type="Edm.String",
-                            key=True,
-                            sortable=True,
                             filterable=True,
-                            facetable=True,
-                            analyzer_name="keyword",
-                        )
-                    ),
-                    SearchableField(
-                        name="content",
-                        type="Edm.String",
-                        analyzer_name=self.search_analyzer_name,
-                    ),
+                            facetable=True),
+                SimpleField(name="md5",
+                            type="Edm.String",
+                            filterable=True,
+                            facetable=True),
+                SimpleField(name="deeplink",
+                            type="Edm.String",
+                            filterable=True,
+                            facetable=False),
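+                # 'updated' (below) is stored as Edm.DateTimeOffset so the index can
+                # filter and facet on document recency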
+                SimpleField(name="updated",
+                            type="Edm.DateTimeOffset",
+                            filterable=True,
+                            facetable=True),
+                SimpleField(
+                    name="sourcepage",
+                    type="Edm.String",
+                    filterable=True,
+                    facetable=True,
+                ),
+                SimpleField(
+                    name="sourcefile",
+                    type="Edm.String",
+                    filterable=True,
+                    facetable=True,
+                ),
+                SimpleField(
+                    name="storageUrl",
+                    type="Edm.String",
+                    filterable=True,
+                    facetable=False,
+                ),
+            ]
+            if self.use_acls:
+                fields.append(
+                    SimpleField(
+                        name="oids",
+                        type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+                        filterable=True,
+                    )
+                )
+                fields.append(
+                    SimpleField(
+                        name="groups",
+                        type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+                        filterable=True,
+                    )
+                )
+            if self.use_int_vectorization:
+                logger.info("Including parent_id field in new index %s", self.search_info.index_name)
+                fields.append(SearchableField(name="parent_id", type="Edm.String", filterable=True))
+            if self.search_images:
+                logger.info("Including imageEmbedding field in new index %s", self.search_info.index_name)
+                fields.append(
                     SearchField(
-                        name="embedding",
+                        name="imageEmbedding",
                         type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                         hidden=False,
                         searchable=True,
                         filterable=False,
                         sortable=False,
                         facetable=False,
-                        vector_search_dimensions=self.embedding_dimensions,
+                        vector_search_dimensions=1024,
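+                        # 1024 matches the dimensionality of the Azure AI Vision image embeddings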
                         vector_search_profile_name="embedding_config",
                     ),
-                    SimpleField(name="category", type="Edm.String", filterable=True, facetable=True),
-                    SimpleField(
-                        name="sourcepage",
-                        type="Edm.String",
-                        filterable=True,
-                        facetable=True,
-                    ),
-                    SimpleField(
-                        name="sourcefile",
-                        type="Edm.String",
-                        filterable=True,
-                        facetable=True,
-                    ),
-                    SimpleField(
-                        name="storageUrl",
-                        type="Edm.String",
-                        filterable=True,
-                        facetable=False,
-                    ),
-                ]
-                if self.use_acls:
-                    fields.append(
-                        SimpleField(
-                            name="oids",
-                            type=SearchFieldDataType.Collection(SearchFieldDataType.String),
-                            filterable=True,
-                        )
-                    )
-                    fields.append(
-                        SimpleField(
-                            name="groups",
-                            type=SearchFieldDataType.Collection(SearchFieldDataType.String),
-                            filterable=True,
-                        )
-                    )
-                if self.use_int_vectorization:
-                    logger.info("Including parent_id field in new index %s", self.search_info.index_name)
-                    fields.append(SearchableField(name="parent_id", type="Edm.String", filterable=True))
-                if self.search_images:
-                    logger.info("Including imageEmbedding field in new index %s", self.search_info.index_name)
-                    fields.append(
-                        SearchField(
-                            name="imageEmbedding",
-                            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
-                            hidden=False,
-                            searchable=True,
-                            filterable=False,
-                            sortable=False,
-                            facetable=False,
-                            vector_search_dimensions=1024,
-                            vector_search_profile_name="embedding_config",
-                        ),
-                    )
+                )
+
+            if self.search_info.index_name not in [name async for name in search_index_client.list_index_names()]:
+                logger.info("Creating new search index %s", self.search_info.index_name)
 
                 vectorizers = []
                 if self.embeddings and isinstance(self.embeddings, AzureOpenAIEmbeddingService):
@@ -217,16 +234,13 @@ async def create_index(self, vectorizers: Optional[List[VectorSearchVectorizer]]
             else:
                 logger.info("Search index %s already exists", self.search_info.index_name)
                 existing_index = await search_index_client.get_index(self.search_info.index_name)
-                if not any(field.name == "storageUrl" for field in existing_index.fields):
-                    logger.info("Adding storageUrl field to index %s", self.search_info.index_name)
-                    existing_index.fields.append(
-                        SimpleField(
-                            name="storageUrl",
-                            type="Edm.String",
-                            filterable=True,
-                            facetable=False,
-                        ),
-                    )
+                existing_field_names = {field.name for field in existing_index.fields}
+
+                # Check and add missing fields
+                missing_fields = [field for field in fields if field.name not in existing_field_names]
+                if missing_fields:
+                    logger.info("Adding missing fields to index %s: %s", self.search_info.index_name, [field.name for field in missing_fields])
+                    existing_index.fields.extend(missing_fields)
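+                    # NOTE: Azure AI Search supports adding new fields to an existing index,
+                    # but existing fields cannot be modified or removed in place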
                     await search_index_client.create_or_update_index(existing_index)
 
                 if existing_index.vector_search is not None and (
@@ -252,19 +266,52 @@ async def create_index(self, vectorizers: Optional[List[VectorSearchVectorizer]]
                 self.search_info,
             )
 
+    async def file_exists(self, file: File) -> bool:
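+        """Return True when this file is already present and unchanged in the index.
+
+        A file counts as unchanged when a document with the same sourcefile and md5
+        (and category, when one is set) already exists in the search index.
+        """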
+        async with self.search_info.create_search_client() as search_client:
+            ## make sure that we don't update unchanged sections: skip when sourcefile and md5 are both unchanged
+            if file.metadata.get('md5') is not None:
+                assert file.filename() is not None
+                filter = f"sourcefile eq '{str(file.filename())}' and md5 eq '{file.metadata.get('md5')}'"
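+                # NOTE: values are interpolated without OData escaping; a single quote in the
+                # filename or category would break this filter (assumes clean metadata values)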
+
+                # make sure (when applicable) that we don't skip files when different categories share the same file.filename()
+                # TODO: refactoring: check whether file.filename() is a good primary key for the blob,
+                # or whether sha256 (instead of md5) would be more reliable for the blob and the index primary key
+                if file.metadata.get('category') is not None:
+                    filter = filter + f" and category eq '{file.metadata.get('category')}'"
+                max_results = 1
+                result = await search_client.search(
+                    search_text="", filter=filter, top=max_results, include_total_count=True
+                )
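+                # include_total_count=True makes get_count() return the number of matching
+                # documents without having to iterate the result pages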
+                result_count = await result.get_count()
+                if result_count > 0:
+                    logger.debug("Skipping %s, no changes detected.", file.filename())
+                    return True
+            ## -- end of check
+            return False
+
     async def update_content(
-        self, sections: List[Section], image_embeddings: Optional[List[List[float]]] = None, url: Optional[str] = None
-    ):
+        self, sections: List[Section], file: File, image_embeddings: Optional[List[List[float]]] = None
+    ):
         MAX_BATCH_SIZE = 1000
         section_batches = [sections[i : i + MAX_BATCH_SIZE] for i in range(0, len(sections), MAX_BATCH_SIZE)]
 
         async with self.search_info.create_search_client() as search_client:
+
+            ## calculate a (default) 'updated' timestamp in the index's Edm.DateTimeOffset format
+            ## (ISO 8601 with millisecond precision, e.g. 2024-01-31T12:00:00.000Z)
+            if file.metadata.get('updated') is None:
+                docdate = datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'
+            else:
+                docdate = parser.isoparse(file.metadata.get('updated')).strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'
+
             for batch_index, batch in enumerate(section_batches):
                 documents = [
                     {
                         "id": f"{section.content.filename_to_id()}-page-{section_index + batch_index * MAX_BATCH_SIZE}",
                         "content": section.split_page.text,
267- "category" : section .category ,
311+ "category" : file .metadata .get ('category' ),
312+ "md5" : file .metadata .get ('md5' ),
313+ "deeplink" : file .metadata .get ('deeplink' ), # optional deel link original doc source for citiation,inline view
314+ "updated" : docdate ,
268315 "sourcepage" : (
269316 BlobManager .blob_image_name_from_file_page (
270317 filename = section .content .filename (),
@@ -281,9 +328,9 @@ async def update_content(
                     }
                     for section_index, section in enumerate(batch)
                 ]
-                if url:
+                if file.url:
                     for document in documents:
-                        document["storageUrl"] = url
+                        document["storageUrl"] = file.url
                 if self.embeddings:
                     embeddings = await self.embeddings.create_embeddings(
                         texts=[section.split_page.text for section in batch]