Skip to content

Commit 9d2dbf1

Browse files
authored
support checksum
1 parent 2375e70 commit 9d2dbf1

File tree

1 file changed

+138
-91
lines changed

1 file changed

+138
-91
lines changed

app/backend/prepdocslib/searchmanager.py

Lines changed: 138 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
import asyncio
2+
import datetime
3+
import dateutil.parser as parser
24
import logging
35
import os
4-
from typing import List, Optional
6+
from typing import Dict, List, Optional
57

68
from azure.search.documents.indexes.models import (
79
AzureOpenAIVectorizer,
@@ -70,92 +72,107 @@ async def create_index(self, vectorizers: Optional[List[VectorSearchVectorizer]]
7072
logger.info("Checking whether search index %s exists...", self.search_info.index_name)
7173

7274
async with self.search_info.create_search_index_client() as search_index_client:
73-
74-
if self.search_info.index_name not in [name async for name in search_index_client.list_index_names()]:
75-
logger.info("Creating new search index %s", self.search_info.index_name)
76-
fields = [
77-
(
78-
SimpleField(name="id", type="Edm.String", key=True)
79-
if not self.use_int_vectorization
80-
else SearchField(
81-
name="id",
75+
fields = [
76+
(
77+
SimpleField(name="id", type="Edm.String", key=True)
78+
if not self.use_int_vectorization
79+
else SearchField(
80+
name="id",
81+
type="Edm.String",
82+
key=True,
83+
sortable=True,
84+
filterable=True,
85+
facetable=True,
86+
analyzer_name="keyword",
87+
)
88+
),
89+
SearchableField(
90+
name="content",
91+
type="Edm.String",
92+
analyzer_name=self.search_analyzer_name,
93+
),
94+
SearchField(
95+
name="embedding",
96+
type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
97+
hidden=False,
98+
searchable=True,
99+
filterable=False,
100+
sortable=False,
101+
facetable=False,
102+
vector_search_dimensions=self.embedding_dimensions,
103+
vector_search_profile_name="embedding_config",
104+
),
105+
SimpleField(name="category",
82106
type="Edm.String",
83-
key=True,
84-
sortable=True,
85107
filterable=True,
86-
facetable=True,
87-
analyzer_name="keyword",
88-
)
89-
),
90-
SearchableField(
91-
name="content",
92-
type="Edm.String",
93-
analyzer_name=self.search_analyzer_name,
94-
),
108+
facetable=True),
109+
SimpleField(name="md5",
110+
type="Edm.String",
111+
filterable=True,
112+
facetable=True),
113+
SimpleField(name="deeplink",
114+
type="Edm.String",
115+
filterable=True,
116+
facetable=False),
117+
SimpleField(name="updated",
118+
type="Edm.DateTimeOffset",
119+
filterable=True,
120+
facetable=True),
121+
SimpleField(
122+
name="sourcepage",
123+
type="Edm.String",
124+
filterable=True,
125+
facetable=True,
126+
),
127+
SimpleField(
128+
name="sourcefile",
129+
type="Edm.String",
130+
filterable=True,
131+
facetable=True,
132+
),
133+
SimpleField(
134+
name="storageUrl",
135+
type="Edm.String",
136+
filterable=True,
137+
facetable=False,
138+
),
139+
]
140+
if self.use_acls:
141+
fields.append(
142+
SimpleField(
143+
name="oids",
144+
type=SearchFieldDataType.Collection(SearchFieldDataType.String),
145+
filterable=True,
146+
)
147+
)
148+
fields.append(
149+
SimpleField(
150+
name="groups",
151+
type=SearchFieldDataType.Collection(SearchFieldDataType.String),
152+
filterable=True,
153+
)
154+
)
155+
if self.use_int_vectorization:
156+
logger.info("Including parent_id field in new index %s", self.search_info.index_name)
157+
fields.append(SearchableField(name="parent_id", type="Edm.String", filterable=True))
158+
if self.search_images:
159+
logger.info("Including imageEmbedding field in new index %s", self.search_info.index_name)
160+
fields.append(
95161
SearchField(
96-
name="embedding",
162+
name="imageEmbedding",
97163
type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
98164
hidden=False,
99165
searchable=True,
100166
filterable=False,
101167
sortable=False,
102168
facetable=False,
103-
vector_search_dimensions=self.embedding_dimensions,
169+
vector_search_dimensions=1024,
104170
vector_search_profile_name="embedding_config",
105171
),
106-
SimpleField(name="category", type="Edm.String", filterable=True, facetable=True),
107-
SimpleField(
108-
name="sourcepage",
109-
type="Edm.String",
110-
filterable=True,
111-
facetable=True,
112-
),
113-
SimpleField(
114-
name="sourcefile",
115-
type="Edm.String",
116-
filterable=True,
117-
facetable=True,
118-
),
119-
SimpleField(
120-
name="storageUrl",
121-
type="Edm.String",
122-
filterable=True,
123-
facetable=False,
124-
),
125-
]
126-
if self.use_acls:
127-
fields.append(
128-
SimpleField(
129-
name="oids",
130-
type=SearchFieldDataType.Collection(SearchFieldDataType.String),
131-
filterable=True,
132-
)
133-
)
134-
fields.append(
135-
SimpleField(
136-
name="groups",
137-
type=SearchFieldDataType.Collection(SearchFieldDataType.String),
138-
filterable=True,
139-
)
140-
)
141-
if self.use_int_vectorization:
142-
logger.info("Including parent_id field in new index %s", self.search_info.index_name)
143-
fields.append(SearchableField(name="parent_id", type="Edm.String", filterable=True))
144-
if self.search_images:
145-
logger.info("Including imageEmbedding field in new index %s", self.search_info.index_name)
146-
fields.append(
147-
SearchField(
148-
name="imageEmbedding",
149-
type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
150-
hidden=False,
151-
searchable=True,
152-
filterable=False,
153-
sortable=False,
154-
facetable=False,
155-
vector_search_dimensions=1024,
156-
vector_search_profile_name="embedding_config",
157-
),
158-
)
172+
)
173+
174+
if self.search_info.index_name not in [name async for name in search_index_client.list_index_names()]:
175+
logger.info("Creating new search index %s", self.search_info.index_name)
159176

160177
vectorizers = []
161178
if self.embeddings and isinstance(self.embeddings, AzureOpenAIEmbeddingService):
@@ -217,16 +234,13 @@ async def create_index(self, vectorizers: Optional[List[VectorSearchVectorizer]]
217234
else:
218235
logger.info("Search index %s already exists", self.search_info.index_name)
219236
existing_index = await search_index_client.get_index(self.search_info.index_name)
220-
if not any(field.name == "storageUrl" for field in existing_index.fields):
221-
logger.info("Adding storageUrl field to index %s", self.search_info.index_name)
222-
existing_index.fields.append(
223-
SimpleField(
224-
name="storageUrl",
225-
type="Edm.String",
226-
filterable=True,
227-
facetable=False,
228-
),
229-
)
237+
existing_field_names = {field.name for field in existing_index.fields}
238+
239+
# Check and add missing fields
240+
missing_fields = [field for field in fields if field.name not in existing_field_names]
241+
if missing_fields:
242+
logger.info("Adding missing fields to index %s: %s", self.search_info.index_name, [field.name for field in missing_fields])
243+
existing_index.fields.extend(missing_fields)
230244
await search_index_client.create_or_update_index(existing_index)
231245

232246
if existing_index.vector_search is not None and (
@@ -252,19 +266,52 @@ async def create_index(self, vectorizers: Optional[List[VectorSearchVectorizer]]
252266
self.search_info,
253267
)
254268

269+
async def file_exists(self, file: File) -> bool:
    """Return True when an unchanged copy of *file* is already in the index.

    Compares the file's ``sourcefile`` name and ``md5`` checksum (and, when
    present, its ``category``) against existing index documents so that
    unchanged sections are not re-ingested.

    :param file: File whose metadata (``md5``, optionally ``category``) is
        matched against the index.
    :return: True if at least one matching (unchanged) document exists,
        False otherwise — including when the file carries no md5 metadata,
        in which case "unchanged" cannot be detected and we never skip.
    :raises ValueError: if the file has an md5 but no filename, since the
        search filter cannot be built without one.
    """
    md5 = file.metadata.get("md5")
    if md5 is None:
        # No checksum to compare against: always treat the file as changed.
        return False

    filename = file.filename()
    if filename is None:
        # Previously an assert; raise so the check survives `python -O`.
        raise ValueError("file_exists requires a filename to build the search filter")

    def _quote(value) -> str:
        # OData string literals escape an embedded single quote by doubling it;
        # without this, filenames/checksums containing ' would break the filter.
        return str(value).replace("'", "''")

    search_filter = f"sourcefile eq '{_quote(filename)}' and md5 eq '{_quote(md5)}'"
    # Make sure (when applicable) that we don't skip when different categories
    # share the same file.filename().
    # TODO: refactoring: check if using file.filename() as primary key for the blob
    # is a good idea, or better use sha256 (instead of md5) as a reliable key.
    category = file.metadata.get("category")
    if category is not None:
        search_filter += f" and category eq '{_quote(category)}'"

    async with self.search_info.create_search_client() as search_client:
        # One hit is enough to prove the unchanged document already exists.
        result = await search_client.search(
            search_text="", filter=search_filter, top=1, include_total_count=True
        )
        if await result.get_count() > 0:
            logger.debug("Skipping %s, no changes detected.", filename)
            return True
        return False
255293
async def update_content(
256-
self, sections: List[Section], image_embeddings: Optional[List[List[float]]] = None, url: Optional[str] = None
257-
):
294+
self, sections: List[Section], file : File ,image_embeddings: Optional[List[List[float]]] = None):
258295
MAX_BATCH_SIZE = 1000
259296
section_batches = [sections[i : i + MAX_BATCH_SIZE] for i in range(0, len(sections), MAX_BATCH_SIZE)]
260297

261298
async with self.search_info.create_search_client() as search_client:
299+
300+
## calculate a (default) 'updated' timestamp in the index's expected format
301+
if file.metadata.get('updated') is None:
302+
docdate = datetime.now(datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'
303+
else:
304+
docdate = parser.isoparse(file.metadata.get('updated')).strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'
305+
262306
for batch_index, batch in enumerate(section_batches):
263307
documents = [
264308
{
265309
"id": f"{section.content.filename_to_id()}-page-{section_index + batch_index * MAX_BATCH_SIZE}",
266310
"content": section.split_page.text,
267-
"category": section.category,
311+
"category": file.metadata.get('category'),
312+
"md5": file.metadata.get('md5'),
313+
"deeplink": file.metadata.get('deeplink'), # optional deel link original doc source for citiation,inline view
314+
"updated": docdate,
268315
"sourcepage": (
269316
BlobManager.blob_image_name_from_file_page(
270317
filename=section.content.filename(),
@@ -281,9 +328,9 @@ async def update_content(
281328
}
282329
for section_index, section in enumerate(batch)
283330
]
284-
if url:
331+
if file.url:
285332
for document in documents:
286-
document["storageUrl"] = url
333+
document["storageUrl"] = file.url
287334
if self.embeddings:
288335
embeddings = await self.embeddings.create_embeddings(
289336
texts=[section.split_page.text for section in batch]

0 commit comments

Comments
 (0)